Recipe Rating Analysis: Nutritional Values and Ingredients - Full Analysis
1 Data Description and Preparation
1.1 Loading data
Here we load the dataset and do some cleaning and processing. We standardise the variable names and add an ID column so that each recipe has a unique identifier.
# Load the raw dataset, then tidy it: standardise the column names with
# janitor::clean_names() and add a unique numeric ID as the first column.
recipes_raw <- read.csv(here("data/epi_r.csv"))
recipes <- recipes_raw %>%
  clean_names() %>%
  # row_number() is the idiomatic, zero-row-safe replacement for 1:nrow(.),
  # which depends on the magrittr dot and yields c(1, 0) on an empty frame.
  mutate(ID = row_number()) %>%
  select(ID, everything())
1.2 Data Description
# Data dictionary for the non-binary columns (plus a summary row for the 674
# binary tags), rendered as a centred kable table.
variable_dictionary <- tibble(
  Variables = c("**ID**", "**title**", "**rating**", "**calories**", "**protein**", "**fat**", "**sodium**", "**674 binary variables**"),
  Meaning = c("Unique ID", "Recipe name", "Rating of the recipe", "Calories contained in the recipe", "Protein contained in the recipe (grams)", "Fat contained in the recipe (grams)", "Sodium contained in the recipe (milligrams)", "Binary variables, incl. ingredients, types of recipes, US States, diet preferences, etc."),
  Variable_Type = c("Character", "Character", "Categorical", "Numerical", "Numerical", "Numerical", "Numerical", "Binary")
)
variable_dictionary %>%
  kbl() %>%
  kable_styling(position = "center")
| Variables | Meaning | Variable_Type |
|---|---|---|
| ID | Unique ID | Character |
| title | Recipe name | Character |
| rating | Rating of the recipe | Categorical |
| calories | Calories contained in the recipe | Numerical |
| protein | Protein contained in the recipe (grams) | Numerical |
| fat | Fat contained in the recipe (grams) | Numerical |
| sodium | Sodium contained in the recipe (milligrams) | Numerical |
| 674 binary variables | Binary variables, incl. ingredients, types of recipes, US States, diet preferences, etc. | Binary |
1.3 Example of the data
The interactive data table below shows the first 100 recipes from our dataset and can be used to explore the dataset and get a feel for how it is constructed. We have the ID, the recipe title, the 4 nutritional values and then the binary variables relating to various things such as ingredients, diet preferences, or country names, to cite a few.
datatable(recipes %>% head(100), options = list(pageLength = 10))1.4 Classifying variables into categories
Given the large number of variables that we had (680), we decided that we needed to create categories to aggregate them and be able to use them more easily.
We have decided to classify variables into the correct categories manually instead of wasting time trying to automate the process, and having to check manually afterwards anyway to make sure that the classification was done correctly.
# --- Manually curated, non-ingredient tag categories -------------------------
# Each vector lists the clean_names-style column names of the binary tag
# variables belonging to one category.

# US state tags. NOTE(review): not all 50 states appear — presumably only
# those present as columns in epi_r.csv; verify against the data.
us_states <- c("alabama", "alaska", "arizona", "california", "colorado", "connecticut", "florida", "georgia", "hawaii", "idaho", "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana", "maine", "maryland", "massachusetts", "michigan", "minnesota", "mississippi", "missouri", "nebraska", "new_hampshire", "new_jersey", "new_mexico", "new_york", "north_carolina", "ohio", "oklahoma", "oregon", "pennsylvania", "rhode_island", "south_carolina", "tennessee", "texas", "utah", "vermont", "virginia", "washington", "west_virginia", "wisconsin")
# City tags. NOTE(review): despite the name, "london" and "paris" are not US
# cities — confirm whether the name us_cities is intentional.
us_cities <- c("aspen", "atlanta", "beverly_hills", "boston","brooklyn", "buffalo", "cambridge", "chicago", "columbus", "costa_mesa", "dallas", "denver", "healdsburg", "hollywood", "houston", "kansas_city", "lancaster", "las_vegas", "london", "los_angeles", "louisville", "miami", "minneapolis", "new_orleans", "pacific_palisades", "paris", "pasadena", "pittsburgh", "portland", "providence", "san_francisco", "santa_monica", "seattle", "st_louis", "washington_d_c", "westwood", "yonkers")
# Country tags. NOTE(review): "mezcal" is a drink, not a country — confirm
# this entry is deliberately classified here.
countries <- c("australia", "bulgaria", "canada", "chile", "cuba", "dominican_republic", "egypt", "england", "france", "germany", "guam", "haiti", "ireland", "israel", "italy", "jamaica", "japan", "mexico", "mezcal", "peru", "philippines", "spain", "switzerland")
# Alcoholic drinks and alcohol-related tags.
alcohol <- c("alcoholic", "amaretto", "beer", "bitters", "bourbon", "brandy", "calvados", "campari", "chambord", "champagne", "chartreuse", "cocktail", "cognac_armagnac", "creme_de_cacao", "digestif", "eau_de_vie", "fortified_wine", "frangelico", "gin", "grand_marnier", "grappa", "kahlua", "kirsch", "liqueur", "long_beach", "margarita", "marsala", "martini", "midori", "pernod", "port", "punch", "red_wine", "rum", "sake", "sangria", "scotch", "sherry", "sparkling_wine", "spirit", "spritzer", "tequila", "triple_sec", "vermouth", "vodka", "whiskey", "white_wine", "wine")
# Editorial / miscellaneous tags that do not fit any other category.
others <- c("bon_appetit", "bon_app_tit", "condiment_spread", "cr_me_de_cacao", "epi_ushg", "flaming_hot_summer", "frankenrecipe", "harpercollins", "house_garden", "no_meat_no_problem", "parade", "sandwich_theory", "self", "shower", "tested_improved", "windsor", "weelicious", "snack_week", "tailgating", "quick_and_healthy", "picnic", "kitchen_olympics", "house_cocktail", "hors_d_oeuvre", "frozen_dessert", "freezer_food", "edible_gift", "cookbook_critic", "cook_like_a_diner", "condiment", "cocktail_party", "camping", "buffet", "x30_days_of_groceries", "x_cakeweek", "x_wasteless", "x22_minute_meals", "x3_ingredient_recipes")
# Chef-name tags.
chef <- c("anthony_bourdain", "dorie_greenspan", "emeril_lagasse", "nancy_silverton", "suzanne_goin")
# Assorted tags deemed potentially interesting for later analysis.
interesting <- c("advance_prep_required", "entertaining", "epi_loves_the_microwave", "friendsgiving", "game", "gourmet", "healthy", "high_fiber", "hot_drink", "kid_friendly", "kidney_friendly", "microwave", "no_cook", "one_pot_meal", "oscars", "paleo", "pescatarian", "poker_game_night", "potluck", "quick_easy", "cookbooks", "leftovers")
# Season tags.
seasons_vec <- c("winter", "spring", "summer", "fall")
# Holiday and special-occasion tags.
celebrations <- c("anniversary", "back_to_school", "bastille_day", "birthday", "christmas", "christmas_eve", "cinco_de_mayo", "date", "diwali", "easter", "engagement_party", "family_reunion", "father_s_day", "fourth_of_july", "graduation", "halloween", "hanukkah", "kentucky_derby", "kwanzaa", "labor_day", "lunar_new_year", "mardi_gras", "mother_s_day", "new_year_s_day", "new_year_s_eve", "oktoberfest", "party", "passover", "persian_new_year", "purim", "ramadan", "rosh_hashanah_yom_kippur", "shavuot", "st_patrick_s_day", "sukkot", "super_bowl", "thanksgiving", "valentine_s_day", "wedding")
# Non-alcoholic drink tags.
drink_no_alcohol_vec <- c("apple_juice", "fruit_juice", "iced_tea", "lemon_juice", "lime_juice", "orange_juice", "pomegranate_juice", "tea")
# Kitchen tool and appliance tags.
tools <- c("coffee_grinder", "double_boiler", "food_processor", "ice_cream_machine", "juicer", "mandoline", "mixer", "mortar_and_pestle", "pasta_maker", "ramekin", "skewer", "slow_cooker", "smoker", "wok", "blender", "candy_thermometer", "pressure_cooker")
# Cooking-technique tags.
cooking_techniques <- c("raw", "saute", "freeze_chill", "fry", "stir_fry", "simmer", "boil", "broil", "bake", "braise", "chill", "deep_fry", "steam", "rub", "roast", "poach", "pan_fry", "marinate", "grill_barbecue", "grill")
# The four numeric nutrition columns; reused throughout the analysis.
nutritional_values <- c("calories", "protein", "fat", "sodium")
# Meal / course-type tags.
recipe_type_vec <- c("aperitif", "appetizer", "breakfast", "brunch", "dessert", "dinner", "lunch", "side", "snack")
# Dietary-preference tags.
diet_preferences_vec <- c("dairy_free", "fat_free", "kosher","kosher_for_passover", "low_cal", "low_cholesterol", "low_carb", "low_fat", "low_sodium", "low_sugar", "low_no_sugar", "non_alcoholic", "no_sugar_added", "organic", "peanut_free", "soy_free", "sugar_conscious", "tree_nut_free", "vegan", "vegetarian", "wheat_gluten_free")
### Ingredients
# --- Low-level ingredient categories -----------------------------------------
# Each vector lists the binary ingredient columns belonging to one food group.
vegetables_vec <- c("artichoke", "arugula", "asparagus", "butternut_squash", "bean", "beet", "bell_pepper", "bok_choy", "broccoli", "broccoli_rabe", "brussel_sprout", "cabbage", "capers", "carrot", "cauliflower", "celery", "chard", "chile_pepper", "collard_greens", "corn", "cucumber", "eggplant", "endive", "escarole", "fennel", "garlic", "ginger", "green_bean", "green_onion_scallion", "horseradish", "jerusalem_artichoke", "jicama", "kale", "leafy_green", "leek", "legume", "lentil", "lettuce", "lima_bean", "mushroom", "mustard_greens", "okra", "onion", "parsnip", "pea", "pickles", "poblano", "pumpkin", "radicchio", "radish", "root_vegetable", "rutabaga", "salad", "shallot", "soy", "spinach", "squash", "sugar_snap_pea", "tapioca", "tomatillo", "tomato", "turnip", "watercress", "yellow_squash", "yuca", "zucchini")
# Meat sub-categories, combined into all_meats further down.
pork_meat_vec <- c("bacon", "ham", "pork", "pork_chop", "pork_rib", "pork_tenderloin", "prosciutto")
lamb_meat_vec <- c("ground_lamb", "lamb", "lamb_chop", "lamb_shank", "rack_of_lamb")
beef_meat_vec <- c("beef", "beef_rib", "beef_shank", "beef_tenderloin", "brisket", "ground_beef", "hamburger", "veal")
meat_with_wings_vec <- c("chicken", "duck", "goose", "poultry", "poultry_sausage", "quail", "turkey")
meat_various_vec <- c("meatball", "meatloaf", "rabbit", "sausage", "steak", "venison")
# Superseded draft kept for reference: split into seafood_vec and fish_vec below.
# stuff_in_the_water <- c("anchovy", "bass", "caviar", "clam", "cod", "crab", "fish", "halibut", "lobster", "mussel", "octopus", "oyster", "salmon", "sardine", "scallop", "seafood", "shellfish", "shrimp", "snapper", "squid", "swordfish", "tilapia", "trout", "tuna")
seafood_vec <- c("clam", "crab", "lobster", "mussel", "octopus", "oyster", "scallop", "shellfish", "shrimp", "squid")
fish_vec <- c("anchovy", "bass", "caviar", "cod", "halibut", "salmon", "sardine", "snapper", "swordfish", "tilapia", "trout", "tuna")
herbs_vec <- c("anise", "basil", "chive", "cilantro", "coriander", "dill", "lemongrass", "mint", "oregano", "parsley", "rosemary", "sage", "tarragon", "thyme")
nuts_vec <- c("almond", "cashew", "chestnut", "hazelnut", "macadamia_nut", "peanut", "pecan", "pine_nut", "pistachio", "tree_nut", "walnut")
cereals_vec <- c("barley", "bran", "bulgur", "grains", "granola", "oat", "quinoa", "rye", "whole_wheat")
carbs_vec <- c("brown_rice", "chickpea", "cornmeal", "couscous", "hominy_cornmeal_masa", "orzo", "pasta", "potato", "rice", "semolina", "sweet_potato_yam", "wild_rice")
fruits_vec <- c("apple", "apricot", "asian_pear", "avocado", "banana", "berry", "blackberry", "blueberry", "cantaloupe", "cherry", "citrus", "coconut", "cranberry", "currant", "dried_fruit", "fig", "grape", "grapefruit", "guava", "honeydew", "kiwi", "kumquat", "lemon", "lime", "lingonberry", "lychee", "mango", "melon", "nectarine", "olive", "orange", "papaya", "passion_fruit", "peach", "pear", "persimmon", "pineapple", "plantain", "plum", "pomegranate", "prune", "quince", "raisin", "raspberry", "rhubarb", "strawberry", "tamarind", "tangerine", "tropical_fruit", "watermelon")
# Desserts and sweet preparations.
dessert_vec <- c("biscuit", "brownie", "butterscotch_caramel", "cake", "candy", "chocolate", "cobbler_crumble", "cookie", "cookies", "cranberry_sauce", "crepe", "cupcake", "honey", "jam_or_jelly", "maple_syrup", "marshmallow", "muffin","pancake", "pastry", "pie", "smoothie", "sorbet", "souffle_meringue", "waffle")
cheeses_vec <- c("blue_cheese", "brie", "cheddar", "cottage_cheese", "cream_cheese", "feta", "fontina", "goat_cheese", "gouda", "monterey_jack", "mozzarella", "parmesan", "ricotta", "swiss_cheese")
dairy_vec <- c("butter", "buttermilk", "custard", "egg_nog", "ice_cream", "marscarpone", "milk_cream", "sour_cream", "yogurt")
spices_vec <- c("caraway", "cardamom", "chili", "cinnamon", "clove", "cumin", "curry", "hot_pepper", "jalapeno", "marinade", "nutmeg", "paprika", "pepper", "poppy", "saffron", "sesame", "sesame_oil", "soy_sauce", "vanilla", "wasabi")
# --- Top-level aggregations ---------------------------------------------------
# Names of the general category columns; used to select those columns (e.g.
# when building an ingredients data frame) and reused later when looping over
# the categories.
general_categories <- c("vegetable", "meat", "fish", "seafood", "herb", "nut", "fruit", "drink", "cheese", "dairy", "spice")
# Unions of the low-level vectors defined above.
all_meats <- c(beef_meat_vec, pork_meat_vec, lamb_meat_vec, meat_with_wings_vec, meat_various_vec)
all_fish_seafood <- c(fish_vec, seafood_vec)
all_ingredients <- c(vegetables_vec, all_meats, all_fish_seafood, herbs_vec, nuts_vec, cereals_vec, carbs_vec, fruits_vec, drink_no_alcohol_vec, dessert_vec, cheeses_vec, dairy_vec, spices_vec, "egg")
# Columns that are not ingredients and still need to be sorted into categories.
meals <- c("backyard_bbq", "bread", "breadcrumbs", "brine", "burrito", "casserole_gratin", "coffee", "flat_bread", "hummus", "iced_coffee", "lasagna", "macaroni_and_cheese", "mayonnaise", "mustard", "noodle", "oatmeal", "omelet", "peanut_butter", "pizza", "pot_pie", "potato_salad", "quiche", "rose", "salad_dressing", "salsa", "sandwich", "sauce", "seed", "soup_stew", "stew", "stock", "stuffing_dressing", "taco", "tart", "tofu", "tortillas", "vinegar", "frittata", "molasses", "sourdough", "fritter", "phyllo_puff_pastry_dough", "dip")2 Data Understanding
2.1 Structure and summary
# Examine the structure of the data via the first 20 recipes only (printing
# all 20052 rows would be unreadable).
str(head(recipes, 20)) #> 'data.frame': 20 obs. of 681 variables:
#> $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
#> $ title : chr "Lentil, Apple, and Turkey Wrap "..
#> $ rating : num 2.5 4.38 3.75 5 3.12 ...
#> $ calories : num 426 403 165 NA 547 948 NA NA 170 ..
#> $ protein : num 30 18 6 NA 20 19 NA NA 7 23 ...
#> $ fat : num 7 23 7 NA 32 79 NA NA 10 41 ...
#> $ sodium : num 559 1439 165 NA 452 ...
#> $ x_cakeweek : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ x_wasteless : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ x22_minute_meals : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ x3_ingredient_recipes : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ x30_days_of_groceries : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ advance_prep_required : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ alabama : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ alaska : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ alcoholic : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ almond : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ amaretto : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ anchovy : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ anise : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ anniversary : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ anthony_bourdain : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ aperitif : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ appetizer : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ apple : num 1 0 0 0 0 0 0 0 0 0 ...
#> $ apple_juice : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ apricot : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ arizona : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ artichoke : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ arugula : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ asian_pear : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ asparagus : num 0 0 0 0 0 0 1 0 0 0 ...
#> $ aspen : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ atlanta : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ australia : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ avocado : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ back_to_school : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ backyard_bbq : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bacon : num 0 0 0 0 0 1 0 0 0 0 ...
#> $ bake : num 0 1 0 0 1 0 0 0 0 0 ...
#> $ banana : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ barley : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ basil : num 0 0 0 0 0 1 0 0 0 0 ...
#> $ bass : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bastille_day : num 0 1 0 0 0 0 0 0 0 0 ...
#> $ bean : num 1 0 0 0 0 0 0 0 0 0 ...
#> $ beef : num 0 0 0 0 0 0 0 0 1 0 ...
#> $ beef_rib : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beef_shank : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beef_tenderloin : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beer : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beet : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bell_pepper : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ berry : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beverly_hills : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ birthday : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ biscuit : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bitters : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ blackberry : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ blender : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ blue_cheese : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ blueberry : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ boil : num 0 0 0 0 0 0 1 0 0 0 ...
#> $ bok_choy : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bon_appetit : num 0 1 0 1 1 1 1 0 0 0 ...
#> $ bon_app_tit : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ boston : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bourbon : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ braise : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bran : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brandy : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bread : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ breadcrumbs : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ breakfast : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brie : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brine : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brisket : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ broccoli : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ broccoli_rabe : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ broil : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brooklyn : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brown_rice : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brownie : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brunch : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brussel_sprout : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ buffalo : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ buffet : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bulgaria : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bulgur : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ burrito : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ butter : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ buttermilk : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ butternut_squash : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ butterscotch_caramel : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ cabbage : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ cake : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ california : num 0 0 0 0 1 0 0 0 0 0 ...
#> $ calvados : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ cambridge : num 0 0 0 0 0 0 0 0 0 0 ...
#> [list output truncated]
# We have only numerical variables, but in reality just 5 of them behave as true measurements: "rating", "calories", "protein", "fat" and "sodium" could be considered numerical. The other variables should be considered categorical since they allow only for 0 or 1 values.
# Let's have a different look at the data with the summary function.
# Reusing the nutritional_values vector keeps this selection consistent with
# the later chunks that select the same columns.
recipes %>%
  select(rating, all_of(nutritional_values)) %>%
  dfSummary(style = "grid") #> Data Frame Summary
#> recipes
#> Dimensions: 20052 x 5
#> Duplicates: 5576
#>
#> +----+-----------+---------------------------+----------------------+-----------+----------+---------+
#> | No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing |
#> +====+===========+===========================+======================+===========+==========+=========+
#> | 1 | rating | Mean (sd) : 3.7 (1.3) | 0.00 : 1836 ( 9.2%) | I | 20052 | 0 |
#> | | [numeric] | min < med < max: | 1.25 : 164 ( 0.8%) | | (100.0%) | (0.0%) |
#> | | | 0 < 4.4 < 5 | 1.88!: 124 ( 0.6%) | | | |
#> | | | IQR (CV) : 0.6 (0.4) | 2.50 : 532 ( 2.7%) | | | |
#> | | | | 3.12!: 1489 ( 7.4%) | I | | |
#> | | | | 3.75 : 5169 (25.8%) | IIIII | | |
#> | | | | 4.38!: 8019 (40.0%) | IIIIIII | | |
#> | | | | 5.00 : 2719 (13.6%) | II | | |
#> | | | | ! rounded | | | |
#> +----+-----------+---------------------------+----------------------+-----------+----------+---------+
#> | 2 | calories | Mean (sd) : 6323 (359046) | 1858 distinct values | : | 15935 | 4117 |
#> | | [numeric] | min < med < max: | | : | (79.5%) | (20.5%) |
#> | | | 0 < 331 < 30111218 | | : | | |
#> | | | IQR (CV) : 388 (56.8) | | : | | |
#> | | | | | : | | |
#> +----+-----------+---------------------------+----------------------+-----------+----------+---------+
#> | 3 | protein | Mean (sd) : 100.2 (3840) | 282 distinct values | : | 15890 | 4162 |
#> | | [numeric] | min < med < max: | | : | (79.2%) | (20.8%) |
#> | | | 0 < 8 < 236489 | | : | | |
#> | | | IQR (CV) : 24 (38.3) | | : | | |
#> | | | | | : | | |
#> +----+-----------+---------------------------+----------------------+-----------+----------+---------+
#> | 4 | fat | Mean (sd) : 346.9 (20456) | 326 distinct values | : | 15869 | 4183 |
#> | | [numeric] | min < med < max: | | : | (79.1%) | (20.9%) |
#> | | | 0 < 17 < 1722763 | | : | | |
#> | | | IQR (CV) : 26 (59) | | : | | |
#> | | | | | : | | |
#> +----+-----------+---------------------------+----------------------+-----------+----------+---------+
#> | 5 | sodium | Mean (sd) : 6226 (333318) | 2434 distinct values | : | 15933 | 4119 |
#> | | [numeric] | min < med < max: | | : | (79.5%) | (20.5%) |
#> | | | 0 < 294 < 27675110 | | : | | |
#> | | | IQR (CV) : 631 (53.5) | | : | | |
#> | | | | | : | | |
#> +----+-----------+---------------------------+----------------------+-----------+----------+---------+
# We can already see for instance that the majority of the values of the variable "rating" are 4.38 (40% of the total). Moreover, we observe that the variables "calories", "protein", "fat" and "sodium" have roughly 20% of missing values.3 Data Cleaning
3.1 Analysis of NAs
# Plot the number of missing values for rating and each nutritional variable.
recipes %>%
select(rating, all_of(nutritional_values)) %>%
gg_miss_var()+
labs(title = "Number of NA Values for Rating and the nutritional Variables")
# temp_df is our convention for a throwaway data frame used by a single
# analysis and never reused later on.
temp_df <- recipes %>%
select(title, all_of(nutritional_values))
# Row indices of recipes with at least one missing nutritional value.
na_obs <- which(rowSums(is.na(temp_df)) > 0)
# Subset the temporary data frame to only the rows containing NA values.
df_na <- temp_df[na_obs, ]
# Inspection of the result (kept commented out to avoid noisy output).
#df_na
# Count the number of NAs per row (each row has between 1 and 4 NAs here).
na_count <- rowSums(is.na(df_na))
# Tabulate how often each NA count occurs across recipes.
freq_table_na <- table(na_count)
freq_na <- as.data.frame(freq_table_na) %>%
mutate(na_count = as.character(na_count))
# Horizontal bar chart of the NA-count frequencies.
freq_na %>%
ggplot(aes(x=na_count, y=Freq)) +
geom_bar(stat="identity") +
xlab("Number of NAs") +
ylab("Frequency") +
labs(title ="Number of NAs in nutritional values per recipe", subtitle = "NAs among variables calories, protein, fat, sodium") +
coord_flip()recipes <- recipes %>%
drop_na()
Among the recipes which have NAs, we notice that many of them have NAs for all 4 nutritional values — more precisely 4117 out of 4188 recipes. Without any other information available, making an imputation to retrieve such values would not make any sense. Interestingly, we do not observe any recipe with exactly 3 simultaneous NAs.
We could try to impute the 29 recipes that have only 1 NA. The same operation on the 42 recipes with 2 NAs would not deliver accurate and satisfying results. However, we believe it is not worth imputing such NA values at all. We should not forget that the nutritional values per recipe are themselves estimated, so an imputation would amount to an estimation of an estimation — to what extent could it be reliable? We therefore decide to eliminate recipes with NAs in the nutritional values, which represent crucial information for our analysis.
Finally, we would still have 15864 recipes without NAs.
3.2 Eliminate recipes with rating equal to zero
# Frequency table of the rating values, as a data frame with readable names.
rating_count <- table(recipes$rating) %>%
as.data.frame() %>%
rename(rating = Var1,
frequency = Freq)
# Drop recipes with a rating of exactly zero (unrated or unpopular recipes).
recipes <- recipes %>%
filter(rating != 0)There are 1296 recipes which have rating equal to zero. Some of those might be unpopular, others might be too recent to have a rating. For the purpose of our analysis, we decide to eliminate these specific recipes.
We are left with 14568 observations after removing NAs and obs with a 0 rating value.
3.3 Discard copies of recipes
We want to eliminate recipes that have multiple copies. Sometimes the recipes have the same title, but nutritional values are different. This indicates that there are various ways to prepare a specific recipe. We want to keep those recipes that have the same title, but have different nutritional values.
Let’s check for instance Almond Butter Crisps, a recipe which can be found twice in the data set, with ID=1026 and ID=8908.
# Spot-check of a known duplicate before de-duplication (kept for reference):
# recipes %>%
# filter(ID == 1026)
# Keep one copy per combination of title AND all nutritional values, so that
# same-titled recipes with different nutrition (different preparations) survive.
unique_recipes <- distinct(recipes, title, rating, protein, sodium, fat, calories, .keep_all = TRUE)
# Spot-check that only one of the two copies remains:
# unique_recipes %>%
# filter(ID == 8908)
recipes <- unique_recipesNow the data set is free from useless copies. We discarded 1288 copies in total. We lose a bit less if we remove the ones without specific ingredients first, meaning that some duplicate copies don’t contain specific ingredients either.
3.4 Removing recipes without specific ingredients listed
Here we are facing a challenge regarding the general ingredient categories. Indeed, when doing computations on the binary columns, there is no issue since, whether the recipe contains specific ingredients in a category, or only a 1 in the general category, then that information is captured in the corresponding binary column.
However, if we want to compute the total number of ingredients in each category that is present in recipes, then we face problems. To illustrate, let’s assume that we have a recipe which contains 3 vegetables (specific columns in vegetables_vec). If, in addition, the general column is also a 1 for that recipe, then by summing up we get 4 ingredients when it should be 3.
Another problem is related to recipes for which the only column in a category (e.g., vegetables) that has a 1 is the general category (i.e., vegetable), without any specific ingredient listed within the vegetable category (in vegetables_vec); this can lead to issues when counting the number of specific ingredients per category.
In order to decide whether we want to analyse with or without general categories, let’s see how many observations would remain if we remove all the obs for which we have a general category at 1, and all specific ingredients in that category is set to 0.
# Drop recipes whose general category flag (e.g. `vegetable`) is 1 while every
# specific ingredient column of that category is 0: such rows list no concrete
# ingredient and would distort per-category ingredient counts later on.
# `specific_cols`: character vector of specific-ingredient column names.
# `general_flag`: bare (unquoted) name of the general category column,
# forwarded with tidy-eval embracing ({{ }}).
drop_general_only <- function(df, specific_cols, general_flag) {
  df %>%
    filter(!(if_all(all_of(specific_cols), ~ .x == 0) & {{ general_flag }} == 1))
}
# Apply the same rule to every category (the spice filter follows below).
recipes <- recipes %>%
  drop_general_only(vegetables_vec, vegetable) %>%
  drop_general_only(all_meats, meat) %>%
  drop_general_only(fish_vec, fish) %>%
  drop_general_only(seafood_vec, seafood) %>%
  drop_general_only(herbs_vec, herb) %>%
  drop_general_only(nuts_vec, nut) %>%
  drop_general_only(fruits_vec, fruit) %>%
  drop_general_only(drink_no_alcohol_vec, drink) %>%
  drop_general_only(cheeses_vec, cheese) %>%
  drop_general_only(dairy_vec, dairy) %>%
filter(!(if_all(all_of(spices_vec), ~.x == 0) & spice == 1))
We are left with 10321 observations after removing all recipes which have no specific ingredient in at least one category while that category’s general variable is set to 1.
4 EDA
4.1 Nutrition EDA
4.1.1 Visual exploration - Univariate Analysis
4.1.1.1 Rating Barplot
# Bar chart of the rating distribution: one bar per rating level, coloured
# on a red-to-green scale from the worst rating to the best.
ggplot(recipes, aes(x = as.factor(rating), fill = as.factor(rating))) +
  geom_bar() +
  scale_fill_manual(values = c("red4", "red3", "orangered", "orange", "gold", "greenyellow", "green3", "green4")) +
  theme(legend.position = "none") +
  scale_y_continuous(breaks = seq(0, 10000, 1000)) +
  labs(x = "Rating", y = "Number of recipes",
       title = "Overview of recipes' ratings")
The data available provide ratings which are separated in 7 distinct
categories, which span from 1.25 to 5. We do not forget that we decided
to exclude recipes with rating equal to zero. As we can see, most of the
ratings have value equal or above 3.75, more in particular we notice
that most of the recipes have ratings of 4.375.
4.1.1.2 Calories - Boxplot and Histogram
# Long-format copy of the four nutrition columns: one row per
# (recipe, nutrient), with the nutrient name in `nutrition` and its
# amount in `n_value`. Used by all the univariate plots below.
recipes_plot <- pivot_longer(
  recipes,
  cols = c(calories, protein, fat, sodium),
  names_to = "nutrition",
  values_to = "n_value"
)
# Boxplot of the raw (unfiltered) calorie values; the extreme outliers
# visible here are inspected and removed further below.
recipes_plot %>%
filter(nutrition == "calories") %>%
ggplot(aes(x=nutrition, y=n_value, fill=nutrition)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6, option="A") +
theme_light() +
theme(legend.position="none",
plot.title = element_text(size=11)) +
ggtitle("Boxplot of calories nutritional value") +
xlab("") +
ylab("Value")
We notice that there are recipes with more than 30’000’000 calories, which are clearly outliers. It is also hard to interpret these values from the boxplot, and even with a density plot we cannot extract any insight. We must then discard those values in order to continue with a meaningful analysis. There are 28 recipes which have more than 7000 calories. We consider those as extreme values, which represent 0.27% of the data available. Why do they exist? It could be due to a miscalculation or to an excessive number of servings per recipe. By evaluating the usual number of calories per recipe, we decided to keep those that have a reasonable quantity, i.e., below 7000.
recipes_plot %>%
filter(nutrition == "calories") %>%
select(title, nutrition, n_value) %>%
arrange(desc(n_value)) #> # A tibble: 10,321 x 3
#> title nutrition n_value
#> <chr> <chr> <dbl>
#> 1 "Pear-Cranberry Mincemeat Lattice Pie " calories 3.01e7
#> 2 "Deep-Dish Wild Blueberry Pie " calories 3.00e7
#> 3 "Apricot, Cranberry and Walnut Pie " calories 1.31e7
#> 4 "Lamb Köfte with Tarator Sauce " calories 4.52e6
#> 5 "Rice Pilaf with Lamb, Carrots, and Raisins " calories 4.16e6
#> 6 "Chocolate-Almond Pie " calories 3.36e6
#> 7 "Caramelized Apple and Pear Pie " calories 3.36e6
#> 8 "Merguez Lamb Patties with Golden Raisin Cousco~ calories 5.45e4
#> 9 "Grilled Lamb Chops with Porcini Mustard " calories 2.41e4
#> 10 "Braised Short Ribs with Red Wine Gravy " calories 1.96e4
#> # i 10,311 more rows
# Calories boxplot, restricted to plausible values (<= 7000 kcal).
# `df` is reused by the histogram/density overlay in the next chunk.
df <- recipes_plot %>%
filter(nutrition == "calories", n_value <= 7000)
boxplot_calories <- df %>%
ggplot(aes(x=nutrition, y=n_value, fill=nutrition)) +
geom_boxplot()+
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
theme(legend.position="none",
plot.title = element_text(size=11)) +
scale_y_continuous(breaks=seq(0,7000,500)) +
labs(x = " ", y = "Calories", title = "Boxplot of calories nutritional value filtered")
boxplot_calories
# 1. Create the histogram plot
phist <- gghistogram(
df, x = "n_value",
add = "mean", rug = TRUE, color = "red3"
) +
labs(title="Distribution of calories across all recipes", x="Calories", y="Count") +
theme(plot.title = element_text(size=20)) +
scale_x_continuous(breaks=seq(0,7000,500))
# 2. Create the density plot with y-axis on the right.
# alpha = 0 keeps the density fill fully transparent so only the curve shows.
# Remove x axis elements (the histogram underneath supplies them).
pdensity <- ggdensity(
df, x = "n_value", color = "blue",
alpha = 0
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05)), position = "right") +
theme_half_open(11, rel_small = 1) +
rremove("x.axis")+
rremove("xlab") +
rremove("x.text") +
rremove("x.ticks") +
rremove("legend")
# 3. Align the two plots and then overlay them (density drawn on top).
aligned_plots <- align_plots(phist, pdensity, align="hv", axis="tblr")
ggdraw(aligned_plots[[1]]) +
draw_plot(aligned_plots[[2]])
After filtering extreme values, we can observe that most of the recipes have between 200 and 600 calories. By checking the distribution of calories with the histogram and density plot, we observe that indeed most of the recipes have fewer than 1000 calories.
4.1.1.3 Protein - Boxplot and Histogram
# Boxplot of the raw (unfiltered) protein values; the extreme outliers
# visible here are inspected and removed further below.
recipes_plot %>%
filter(nutrition == "protein") %>%
ggplot(aes(x=nutrition, y=n_value, fill=nutrition)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6, option="A") +
theme_light() +
theme(legend.position="none",
plot.title = element_text(size=11)) +
ggtitle("Boxplot of protein nutritional value") +
xlab("") +
ylab("Value")
We notice that there are recipes with more than 50’000 grams of protein
which are clearly outliers. We must then discard those values in order
to continue with a meaningful analysis. Also from a visual point of view
we could not extract any relevant information. By checking on the
epicurious website the recipes with protein values above 1000, we
verified that such amount of proteins was not justified. We came to that
conclusion by evaluating the average values of protein per 100grams of
each ingredient in the specific recipe.
recipes_plot %>%
filter(nutrition == "protein") %>%
select(title, nutrition, n_value) %>%
arrange(desc(n_value)) #> # A tibble: 10,321 x 3
#> title nutrition n_value
#> <chr> <chr> <dbl>
#> 1 "Rice Pilaf with Lamb, Carrots, and Raisins " protein 236489
#> 2 "Pear-Cranberry Mincemeat Lattice Pie " protein 200968
#> 3 "Deep-Dish Wild Blueberry Pie " protein 200210
#> 4 "Lamb Köfte with Tarator Sauce " protein 166471
#> 5 "Apricot, Cranberry and Walnut Pie " protein 87188
#> 6 "Chocolate-Almond Pie " protein 58334
#> 7 "Caramelized Apple and Pear Pie " protein 58324
#> 8 "Merguez Lamb Patties with Golden Raisin Cousco~ protein 2074
#> 9 "Manhattan Clam Chowder " protein 1625
#> 10 "Clam and Oyster Chowder " protein 1365
#> # i 10,311 more rows
# Keep protein rows at or below the 1000 g plausibility threshold;
# `df` is reused by the histogram/density overlay in the next chunk.
df <- filter(recipes_plot, nutrition == "protein", n_value <= 1000)
# Boxplot of the filtered protein values.
boxplot_protein <- ggplot(df, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  theme(legend.position = "none",
        plot.title = element_text(size = 16)) +
  scale_y_continuous(breaks = seq(0, 1000, 25)) +
  labs(title = "Boxplot of protein nutritional value filtered",
       x = "", y = "Proteins")
boxplot_protein
# 1. Create the histogram plot
phist <- gghistogram(
df, x = "n_value",
add = "mean", rug = TRUE, color = "red3"
) +
labs(title="Distribution of proteins across all recipes", x="Proteins", y="Count") +
theme(plot.title = element_text(size=20)) +
scale_x_continuous(breaks=seq(0,1000,100))
# 2. Create the density plot with y-axis on the right
# Remove x axis elements
pdensity <- ggdensity(
df, x = "n_value", color = "blue",
alpha = 0
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05)), position = "right") +
theme_half_open(11, rel_small = 1) +
rremove("x.axis")+
rremove("xlab") +
rremove("x.text") +
rremove("x.ticks") +
rremove("legend")
# 3. Align the two plots and then overlay them.
aligned_plots <- align_plots(phist, pdensity, align="hv", axis="tblr")
ggdraw(aligned_plots[[1]]) +
draw_plot(aligned_plots[[2]])
From the boxplot, we observe that most recipes have less than 30 grams of protein. By plotting the histogram and density plot, we verify that this information is correct. We could even extend the range to 100 grams of protein per recipe. We assume that recipes with values above this threshold contain ingredients like meat, tuna, salmon or shrimp.
4.1.1.4 Sodium - Boxplot and Histogram
# Boxplot of the raw (unfiltered) sodium values; the extreme outliers
# visible here are inspected and removed further below.
recipes_plot %>%
filter(nutrition == "sodium") %>%
ggplot(aes(x=nutrition, y=n_value, fill=nutrition)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6, option="A") +
theme(legend.position="none",
plot.title = element_text(size=11)) +
labs(title="Boxplot of sodium nutritional value") +
xlab("") +
ylab("Value")
We notice that there are recipes with more than 100’000 milligrams of sodium, which are clearly outliers. We must then discard those values in order to continue with a meaningful analysis. By conducting further research, we realize that sodium values above 30’000 are highly suspicious.
recipes_plot %>%
filter(nutrition == "sodium") %>%
select(title, nutrition, n_value) %>%
arrange(desc(n_value)) #> # A tibble: 10,321 x 3
#> title nutrition n_value
#> <chr> <chr> <dbl>
#> 1 "Pear-Cranberry Mincemeat Lattice Pie " sodium 27675110
#> 2 "Deep-Dish Wild Blueberry Pie " sodium 27570999
#> 3 "Apricot, Cranberry and Walnut Pie " sodium 12005810
#> 4 "Lamb Köfte with Tarator Sauce " sodium 7540990
#> 5 "Chocolate-Almond Pie " sodium 3449512
#> 6 "Caramelized Apple and Pear Pie " sodium 3449373
#> 7 "Rice Pilaf with Lamb, Carrots, and Raisins " sodium 3134853
#> 8 "Whole Branzino Roasted in Salt " sodium 132220
#> 9 "Red Snapper Baked in Salt with Romesco Sauce " sodium 132025
#> 10 "Scallops with Mushrooms in White-Wine Sauce " sodium 90572
#> # i 10,311 more rows
df <- recipes_plot %>%
filter(nutrition == "sodium", n_value <= 30000)
# Sodium boxplot
boxplot_sodium <- df %>%
ggplot(aes(x=nutrition, y=n_value, fill=nutrition)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6, option="A") +
theme(legend.position="none",
plot.title = element_text(size=17)) +
scale_y_continuous(breaks=seq(0,30000,500)) +
labs(title="Boxplot of sodium nutritional value") +
xlab("") +
ylab("Sodium")
boxplot_sodium
# 1. Create the histogram plot
phist <- gghistogram(
df, x = "n_value",
add = "mean", rug = TRUE, color = "red3"
) +
labs(title="Distribution of sodium across all recipes", x="Sodium", y="Count") +
theme(plot.title = element_text(size=20)) +
scale_x_continuous(breaks=seq(0,30000,3000))
# 2. Create the density plot with y-axis on the right
# Remove x axis elements
pdensity <- ggdensity(
df, x = "n_value", color = "blue",
alpha = 0
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05)), position = "right") +
theme_half_open(11, rel_small = 1) +
rremove("x.axis")+
rremove("xlab") +
rremove("x.text") +
rremove("x.ticks") +
rremove("legend")
# 3. Align the two plots and then overlay them.
aligned_plots <- align_plots(phist, pdensity, align="hv", axis="tblr")
ggdraw(aligned_plots[[1]]) +
draw_plot(aligned_plots[[2]])
From the boxplot we observe that most recipes have sodium values below 750 milligrams. The histogram confirms that most recipes have indeed less than 750 milligrams of sodium, even though we cannot exclude the presence of a fair number of recipes with sodium between 750 and 2000 milligrams.
4.1.1.5 Fat - Boxplot and Histogram
# Boxplot of the raw (unfiltered) fat values; the extreme outliers
# visible here are inspected and removed further below.
recipes_plot %>%
filter(nutrition == "fat") %>%
ggplot(aes(x=nutrition, y=n_value, fill=nutrition)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6, option="A") +
theme_light() +
theme(legend.position="none",
plot.title = element_text(size=11)) +
labs(title="Boxplot of fat nutritional value") +
xlab("") +
ylab("Value")
We notice that there are recipes with more than 44’000 grams of fat, which are clearly outliers. We must then discard those values in order to continue with a meaningful analysis. By checking on the epicurious website the recipes with fat values above 1000, we also verified that the amount of fat was not justified. We came to that conclusion by evaluating the average values of fat per 100 grams of each ingredient in the specific recipe.
recipes_plot %>%
filter(nutrition == "fat") %>%
select(title, nutrition, n_value) %>%
arrange(desc(n_value)) #> # A tibble: 10,321 x 3
#> title nutrition n_value
#> <chr> <chr> <dbl>
#> 1 "Pear-Cranberry Mincemeat Lattice Pie " fat 1722763
#> 2 "Deep-Dish Wild Blueberry Pie " fat 1716279
#> 3 "Apricot, Cranberry and Walnut Pie " fat 747374
#> 4 "Rice Pilaf with Lamb, Carrots, and Raisins " fat 221495
#> 5 "Chocolate-Almond Pie " fat 186660
#> 6 "Caramelized Apple and Pear Pie " fat 186642
#> 7 "Lamb Köfte with Tarator Sauce " fat 44198
#> 8 "Grilled Lamb Chops with Porcini Mustard " fat 2228
#> 9 "Braised Short Ribs with Red Wine Gravy " fat 1818
#> 10 "Braised Duck Legs with Shallots and Parsnips " fat 1610
#> # i 10,311 more rows
df <- recipes_plot %>%
filter(nutrition == "fat", n_value <= 40000)
# Fat boxplot
boxplot_fat <- df %>%
ggplot( aes(x=nutrition, y=n_value, fill=nutrition)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6, option="A") +
theme_light() +
theme(legend.position="none",
plot.title = element_text(size=16)) +
scale_y_continuous(breaks=seq(0,3000,100)) +
labs(title="Boxplot of fat nutritional value filtered") +
xlab("") +
ylab("Fat")
boxplot_fat
# 1. Create the histogram plot
phist <- gghistogram(
df, x = "n_value",
add = "mean", rug = TRUE, color = "red3"
) +
labs(title="Distribution of fat across all recipes", x="Fat", y="Count") +
theme(plot.title = element_text(size=20)) +
scale_x_continuous(breaks=seq(0,3000,300))
# 2. Create the density plot with y-axis on the right
# Remove x axis elements
pdensity <- ggdensity(
df, x = "n_value", color = "blue",
alpha = 0
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.05)), position = "right") +
theme_half_open(11, rel_small = 1) +
rremove("x.axis")+
rremove("xlab") +
rremove("x.text") +
rremove("x.ticks") +
rremove("legend")
# 3. Align the two plots and then overlay them.
aligned_plots <- align_plots(phist, pdensity, align="hv", axis="tblr")
ggdraw(aligned_plots[[1]]) +
draw_plot(aligned_plots[[2]])
It is hard to interpret the boxplot. There are certain recipes which
could have potentially more than 1000 or even 2000 grams of fat because
of the high quantity of servings and the use of ingredients such as
lamb, duck and bacon. We must then analyse the histogram to have a
better overview and we notice that most recipes have fat values below
100 grams.
4.1.2 Visual exploration - Multivariate Analysis
4.1.2.1 Scatterplots of Rating-Calories
# Remove the outlier rows identified in the univariate analysis above (41 in
# total) from the main recipes dataframe before the bivariate plots.
recipes <- recipes %>%
filter(calories <= 7000, protein <= 1000, sodium <= 30000, fat <= 40000)
# Scatterplot of Rating-Calories
sp1 <- recipes %>%
ggplot(aes(x=calories, y=rating)) +
geom_point(alpha=.5) +
ggtitle("Scatterplot of rating against calories") +
xlab("Calories") +
ylab("Rating")
# Scatterplot of Rating-Protein
sp2 <- recipes %>%
ggplot(aes(x=protein, y=rating)) +
geom_point(alpha=.5) +
ggtitle("Scatterplot of rating against proteins") +
xlab("Proteins") +
ylab("Rating")
# Scatterplot of Rating-Fat
sp3 <- recipes %>%
ggplot(aes(x=fat, y=rating)) +
geom_point(alpha=.5) +
ggtitle("Scatterplot of rating against fat") +
xlab("Fat") +
ylab("Rating")
# Scatterplot of Rating-Sodium
sp4 <- recipes %>%
ggplot(aes(x=sodium, y=rating)) +
geom_point(alpha=.5) +
ggtitle("Scatterplot of rating against sodium") +
xlab("Sodium") +
ylab("Rating")
# Arrange the four scatterplots in a 2x2 grid.
grid.arrange(sp1, sp2, sp3, sp4, ncol=2, nrow =2)
We can observe that the recipes with more than 2000 calories tend to
have a higher rating. For instance, few recipes with less than a 3 star
rating have more than 2000 calories.
We can observe that the recipes with more than 125 grams of proteins tend to have a higher rating. For instance, few recipes with less than a 3 star rating have more than 125 grams of proteins.
We can observe that the recipes with more than 100 grams of fat tend to have a higher rating. For instance, few recipes with less than a 3 star rating have more than 100 grams of fat.
We can observe that the recipes with more than 5000 milligrams of sodium tend to have a higher rating. For instance, few recipes with less than a 3 star rating have more than 5000 mg of sodium.
4.1.2.2 Correlogram
# Correlation matrix between the rating and the four nutritional values,
# visualised with corrplot. Assignment uses `<-` (R convention) rather
# than `=` at the top level.
corr_nutritional_values <- recipes %>%
  select(rating, calories, protein, fat, sodium) %>%
  cor()
corrplot(corr_nutritional_values)
The previous scatterplots misled us into believing that there was some
correlation between rating and the nutritional values. This hypothesis
has been refuted because the correlation against the rating is almost at
zero for all the nutritional values. On the other hand we notice a
strong positive correlation between calories and fat as well as between
calories and proteins.
4.1.2.3 Grouped Scatter
We decide to plot together the variables which highlight a great level of correlation.
# Grouped scatter of calories and fat; axes are truncated to the dense
# region and points are coloured by rating on a rainbow gradient.
recipes_plot1 <- recipes %>%
filter(fat <= 400, calories <= 6000) %>%
ggplot(aes(x=calories, y=fat, color=rating)) +
geom_point(alpha=.5) +
scale_color_gradientn(colours = rainbow(5))
# Grouped scatter of calories and protein
recipes_plot2 <- recipes %>%
filter(protein <= 500, calories <= 6000) %>%
ggplot(aes(x=calories, y=protein, color=rating)) +
geom_point(alpha=.5) +
scale_color_gradientn(colours = rainbow(5))
# Grouped scatter of protein and fat
recipes_plot3 <- recipes %>%
filter(fat <= 400, protein <= 350) %>%
ggplot(aes(x=protein, y=fat, color=rating)) +
geom_point(alpha=.5) +
scale_color_gradientn(colours = rainbow(5))
# Grouped scatter of protein and sodium
recipes_plot4 <- recipes %>%
filter(sodium <= 400, protein <= 350) %>%
ggplot(aes(x=protein, y=sodium, color=rating)) +
geom_point(alpha=.5) +
scale_color_gradientn(colours = rainbow(5))
# Arrange the four grouped scatterplots in a 2x2 grid.
grid.arrange(recipes_plot1, recipes_plot2, recipes_plot3, recipes_plot4, ncol=2, nrow =2)
We notice a positive correlation between fat and calories as well as
between protein and calories. We also see a slightly positive
correlation between fat and protein. We tried to understand to what
extent the rating could have an impact on such relationships, but the
number of ratings above 3 is overwhelming and strongly determines the
behavior of these relationships.
4.2 Ingredients EDA
4.2.1 Feature engineering
We discovered that the variable “drinks” on its own had only 11 observations, 4 of which also had the value “drink” equal to 1. Therefore, we decided to merge the two columns so that a single category called “drink” covers all drinks.
# Creating a new dataframe with only the ID, title, ingredient columns and
# rating (the select() completing this pipe follows on the next line).
ingredients_df <- recipes %>%
mutate(drink = ifelse(drink == 1 | drinks == 1, 1, 0)) %>% # merge the rare "drinks" flag into "drink"
select(ID, title, all_of(all_ingredients), rating)
4.2.1.1 Creating binary ingredients categories
# One engineered indicator per category: *_bin is 1 when the recipe contains
# at least one ingredient of that category (if_any over the category's
# specific-ingredient columns), 0 otherwise.
ingredients_df_bin <- ingredients_df %>%
mutate(vegetables_bin = as.numeric(if_any(all_of(vegetables_vec), ~.x == 1, na.rm = TRUE)),
meats_bin = as.numeric(if_any(all_of(all_meats), ~.x == 1, na.rm = TRUE)),
fish_bin = as.numeric(if_any(all_of(fish_vec), ~.x == 1, na.rm = TRUE)),
seafood_bin = as.numeric(if_any(all_of(seafood_vec), ~.x == 1, na.rm = TRUE)),
herbs_bin = as.numeric(if_any(all_of(herbs_vec), ~.x == 1, na.rm = TRUE)),
nuts_bin = as.numeric(if_any(all_of(nuts_vec), ~.x == 1, na.rm = TRUE)),
fruits_bin = as.numeric(if_any(all_of(fruits_vec), ~.x == 1, na.rm = TRUE)),
cheese_bin = as.numeric(if_any(all_of(cheeses_vec), ~.x == 1, na.rm = TRUE)),
dairy_bin = as.numeric(if_any(all_of(dairy_vec), ~.x == 1, na.rm = TRUE)),
spices_bin = as.numeric(if_any(all_of(spices_vec), ~.x == 1, na.rm = TRUE)),
cereals_bin = as.numeric(if_any(all_of(cereals_vec), ~.x == 1, na.rm = TRUE)),
carbs_bin = as.numeric(if_any(all_of(carbs_vec), ~.x == 1, na.rm = TRUE)),
dessert_bin = as.numeric(if_any(all_of(dessert_vec), ~.x == 1, na.rm = TRUE)),
egg_bin = (egg) # egg is already a single binary column, so it is reused as-is
) %>%
select(ID, title, contains("bin"), everything())
The fact that both select the same number of rows makes the general categories redundant in the dataset. They are not useful to create the binary columns, and they are also not useful to compute the total amount of ingredients in each category per recipe — so let’s just not include them in the first place.
####testing if I still need to include the general category to create the binary column now that I modified the df to only include recipes with ingredients specified
#
# #6586
# ingredients_df %>%
# mutate(vegetables_bin = as.factor(as.numeric(if_any(c(vegetable, all_of(vegetables_vec)), ~.x == 1, na.rm = TRUE)))) %>%
# filter(vegetables_bin == 1)
#
# #6586
# ingredients_df %>%
# mutate(vegetables_bin = as.factor(as.numeric(if_any(all_of(vegetables_vec), ~.x == 1, na.rm = TRUE)))) %>%
# filter(vegetables_bin == 1)4.2.1.2 Creating total ingredients categories
# Per-recipe counts: each total_* column is the number of specific
# ingredients of that category present in the recipe (row sum over the
# category's 0/1 columns); total_ingredients sums over all of them.
ingredients_df_total <- ingredients_df %>%
mutate(total_ingredients = rowSums(select(., c(all_of(all_ingredients)))),
total_vegetables = rowSums(select(., c(all_of(vegetables_vec)))),
total_meat = rowSums(select(., c(all_of(all_meats)))),
total_fish = rowSums(select(., c(all_of(fish_vec)))),
total_seafood = rowSums(select(., c(all_of(seafood_vec)))),
total_herbs = rowSums(select(., c(all_of(herbs_vec)))),
total_nuts = rowSums(select(., c(all_of(nuts_vec)))),
total_fruits = rowSums(select(., c(all_of(fruits_vec)))),
total_cheese = rowSums(select(., c(all_of(cheeses_vec)))),
total_dairy= rowSums(select(., c(all_of(dairy_vec)))),
total_spices= rowSums(select(., c(all_of(spices_vec)))),
total_cereals= rowSums(select(., c(all_of(cereals_vec)))),
total_carbs = rowSums(select(., c(all_of(carbs_vec)))),
total_dessert = rowSums(select(., c(all_of(dessert_vec))))
) %>%
select(ID, title, contains("total"), everything())
4.2.1.3 Creating ingredients_df_full
Creating “ingredients_df_full” which contains bin columns, total columns, and original ingredients columns
# Per-recipe totals keyed by ID, to be merged onto the binary-indicator table.
total_join <- ingredients_df_total %>%
  select(ID, contains("total"))
# Join explicitly on ID (the only shared column) instead of relying on
# left_join()'s implicit common-column detection and its console message.
ingredients_df_full <- ingredients_df_bin %>%
  left_join(total_join, by = "ID") %>%
select(ID, title, rating, contains("bin"), contains("total"), everything())
4.2.2 “Binary” engineered ingredients categories
4.2.2.1 Frequency of ingredients - binary categories
This gives us interesting information about the frequency of each ingredient being present at least once in a recipe. As we can see, there is at least one vegetable in around 6750 recipes out of the 11380 total we have. Inversely, a very low amount of recipes contains at least one type of cereal.
# Column names of the engineered binary category indicators (all contain "bin"),
# stored once so they can be selected easily afterwards.
binary_columns <- ingredients_df_bin %>%
  select(contains("bin")) %>%
  colnames()
# Per category: number of recipes containing at least one ingredient of it.
total_categories <- ingredients_df_bin %>%
  select(ID, all_of(binary_columns)) %>%
  pivot_longer(-ID, names_to = "category", values_to = "binary_value") %>%
  group_by(category) %>%
  summarise(total = sum(binary_value))
# Barplot of category frequencies, ordered by count.
total_categories %>%
  ggplot(aes(x = reorder(category, total), y = total, fill = total)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  scale_x_discrete(guide = guide_axis(n.dodge = 3)) +
  scale_fill_viridis() +
  labs(x = "Category", y = "Amount of recipes", title = "Total amount of recipes containing at least one ingredient in defined categories")
Most of the recipes contain vegetables, fruits, meats, herbs and carbs,
which is not a surprise. The barplots below give us similar information
about the amount of recipe which contain at least one ingredient in each
category. The only category for which an ingredient is present at least
once in more than 50% of the recipes is vegetables.
ingredients_df_bin %>%
select(contains("bin")) %>%
mutate(across(everything(), as.factor)) %>%
plot_bar(order_bar = FALSE)
4.2.2.2 Relationship between binary ingredients categories and ratings
4.2.2.2.1 Barplots - 7-level rating variable
We now want to check the relationship between our rating
variable and all the binary ingredients variables. We first plot this
relationships with multilevel barplots.
It is not so straightforward to analyse but we can see that for example recipes with no fish have a higher chance of getting a rating of 5 when compared to recipes with fish in them. Recipes with carbs seem to have the lowest frequency of rating at 5 out of all the binary categories.
ingredients_df_bin %>%
select(contains("bin"), rating) %>%
mutate(across(everything(), as.factor)) %>%
plot_bar(by = "rating", order_bar = FALSE)
##### Correlation plot
We see some somewhat strong negative correlation
between vegetables and dessert, and between vegetable and fruits. This
makes sense, as these ingredients are rarely found together in recipes.
As a side note, we chose to classify tomato as a vegetable and strongly
stand by this opinion :)
Concerning positive correlations, we see nuts and desert as highly correlated. This is probably because they go well together in sugary recipes. Additionally, egg and dairy are also slightly positively correlated. This most likely comes from patisserie recipes where eggs and dairy ingredients go hand in hand.
When looking at the correlation of the binary ingredients variables
with the 7-level rating (i.e., rating), we see that the
highest negative correlation is between rating and
carbs_bin which could sound a bit surprising given the
assumed high popularity of pasta for example. The highest positive
correlation is with fruit_bin at 0.04. We note however that
all correlations between binary ingredients variables and rating are
disappointingly weak.
Given this low correlation between the “binary” ingredients categories and the 7-level rating, we decided to investigate if we could find a higher correlation by transforming the variable to a binary rating. We decide to put the threshold for a “bad” or “good” rating at 4. Unfortunately, the results are very similar to the 7-level rating variable, with very weak correlations.
#corr plot
ingredients_df_bin %>%
mutate(rating_bin = ifelse(rating>4, 1, 0)) %>%
select(rating, rating_bin, contains("bin")) %>%
plot_correlation(title = 'Correlation of rating with the "binary" ingredients categories')
4.2.2.2.2 Barplots - binary rating variable
Despite the low correlation with the binary rating, we still want to visualise it like we did above for the multilevel rating.
ingredients_df_bin %>%
select(contains("bin"), rating) %>%
mutate(rating_bin = ifelse(rating>4, "good", "bad"), across(everything(), as.factor)) %>%
select(-rating) %>%
plot_bar(by = "rating_bin", order_bar = FALSE)
We now have only 2 categories: recipes with ratings above 4 and recipes with ratings of 4 or below. There is no clear relationship in those graphs either, and this confirms the correlation results that we found above in the correlation plot.
As far as interpretation goes, if we look at vegetables for example, we can see that the proportion of recipes with ratings above 4 is higher for recipes containing no vegetables, when compared to recipes containing at least one vegetable.
4.2.3 “Total” engineered ingredients variables
# Analysis: which single ingredients are present in the most recipes?
df <- ingredients_df %>%
  select(-title, -rating) %>%
  pivot_longer(-ID, names_to = "ingredient", values_to = "value")
# Number of recipes containing each ingredient, keeping only the top 10.
ing_top10 <- df %>%
  group_by(ingredient) %>%
  summarise(total = sum(value)) %>%
  ungroup() %>%
  arrange(desc(total)) %>%
  dplyr::slice(1:10)
ing_top10 %>%
  ggplot(aes(x = reorder(ingredient, total), y = total, fill = ingredient)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  scale_fill_viridis(discrete = TRUE) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  labs(x = "Ingredient", y = "Value", title = "Total amount of recipes containing each ingredient\nTop 10")
As we can observe, many recipes contain milk cream, onion, tomato,
salad, egg and garlic. Some of these are versatile as they can be used
for many different kinds of recipes. Onion and garlic are widely used
for giving flavour to many dishes, whereas egg and milk cream can be
used to cook salty and sweet recipes.
4.2.3.1 Correlation between total number of ingredients per category and rating variable
ingredients_df_total %>%
mutate(rating_bin = ifelse(rating>4, 1, 0)) %>%
select(rating, rating_bin, contains("total")) %>%
plot_correlation(title = 'Correlation rating with the "total" ingredients categories')
We notice a negative relationship between the number of fruits and
number of vegetables which makes sense these two kinds of ingredients
are rarely combined in a recipe. The same is true for the number of
vegetables related to the number of desserts. The results are similar to
the previous correlogram with the binary columns. In this case we do not
notice any significant relationship between the multilevel rating rating
and the other variables.
Results are disappointing again for the rating_bin
variable, and remain in line with what we found so far. Correlations
between total number of ingredients and both rating variable are very
weak.
4.2.3.2 Amount of ingredients per recipe
The number of ingredients per recipe is more or less normally distributed, with a mean around 4.75.
#checking some stuff about the new ingredients table
ingredients_df_total %>%
select(ID, title, total_ingredients) %>%
ggplot(aes(x=total_ingredients)) +
geom_bar() + geom_vline(aes(xintercept=mean(total_ingredients)),color="red", linetype="dashed", size=1)+
scale_x_continuous(breaks = seq(1, 12, by = 1)) +
labs(x="Number of ingredients per recipe", y = "Recipe Count", title = "Distribution of number of ingredients per recipe")
#we notice that 117 (no NAs and RAT0, and not duplicated) recipes have 0 ingredients, let's investigate why and how that's possible
# List the recipes with zero classified ingredients to understand how this
# can happen despite earlier cleaning.
ingredients_df_total %>%
filter(total_ingredients==0)
#let's pick recipe ID number 1183 which should have poppy and sesame seeds according to the title
recipes %>%
filter(ID == 1183) %>%
select_if(~ any(. == 1))
#we can see that only 3 variables are equal to 1 here
# Two more examples of recipes with no classified ingredients:
recipes %>%
filter(ID == 365) %>%
select_if(~ any(. == 1))
recipes %>%
filter(ID == 1089) %>%
select_if(~ any(. == 1))
#####
#QUESTION: do we want to keep those recipes? (resolved below: they are removed)
#####
# Sanity check: recipes with unusually many (>10) classified ingredients.
ingredients_df_total %>%
filter(total_ingredients >10)
Based on this information, we decide to eliminate those 117 observations which don’t contain any ingredients that we have classified in our vectors.
# Eliminating all recipes which do not contain any ingredient classified in
# our category vectors (i.e. total_ingredients == 0 over "all_ingredients").
index_zero_ingredient <- ingredients_df_full %>%
filter(total_ingredients == 0) %>% pull(ID)
# Removing the 117 recipes with no ingredients listed from both `recipes`
# and `ingredients_df_full` (the filter completing this pipe follows).
recipes <- recipes %>%
filter(!ID %in% index_zero_ingredient)
ingredients_df_full <- ingredients_df_full %>%
filter(!total_ingredients == 0)
We also noticed that 5 ingredient variables are actually present in none of the recipes we have. We therefore remove those 5 ingredient variables, leaving us with 288 binary ingredient variables.
# Identify ingredient columns that are 0 for every remaining recipe: once
# converted to factor they have a single level.
ingredients_to_remove <- recipes %>%
select(all_of(all_ingredients)) %>%
mutate(across(all_of(all_ingredients), as.factor)) %>%
select_if(~nlevels(.) == 1) %>% colnames()
# Drop those always-zero ingredient columns (5 of them) from recipes.
recipes <- recipes %>%
select(-all_of(ingredients_to_remove))
all_ingredients <- all_ingredients[!all_ingredients %in% ingredients_to_remove]Besides the total amount of ingredients, let’s check the amount of ingredients per recipe for the top 3 categories in terms of ingredients frequency (i.e., vegetables, fruits, meats)
# Ingredient-count distributions for the three most frequent ingredient
# categories (vegetables, fruits, meats); dashed blue line marks the mean.
show_nveg <- ingredients_df_full %>%
  filter(vegetables_bin == 1) %>%
  select(ID, title, total_vegetables) %>%
  ggplot(aes(x = total_vegetables)) +
  scale_x_continuous(breaks = seq(1, 9, by = 1)) +
  geom_bar() +
  geom_vline(aes(xintercept = mean(total_vegetables)), color = "blue", linetype = "dashed", size = 1) +
  # axis labels added for consistency with the meat plot below
  labs(x = "Number of vegetables per recipe", y = "Recipe Count")
show_nfruit <- ingredients_df_full %>%
  filter(fruits_bin == 1) %>%
  select(ID, title, total_fruits) %>%
  ggplot(aes(x = total_fruits)) +
  scale_x_continuous(breaks = seq(1, 9, by = 1)) +
  geom_bar() +
  geom_vline(aes(xintercept = mean(total_fruits)), color = "blue", linetype = "dashed", size = 1) +
  labs(x = "Number of fruits per recipe", y = "Recipe Count")
# Restrict to recipes that actually contain meat, so counts start at 1
# (0-meat rows previously appeared because the general "meat" category was
# missing from total_meat; that has been fixed upstream).
show_nmeat <- ingredients_df_full %>%
  filter(meats_bin == 1) %>%
  select(ID, title, total_meat) %>%
  ggplot(aes(x = total_meat)) +
  geom_bar() +
  geom_vline(aes(xintercept = mean(total_meat)), color = "blue", linetype = "dashed", size = 1) +
  # "Distribution" typo fixed in the title
  labs(x = "Number of meats per recipe", y = "Recipe Count",
       title = "Distribution of number of meats per recipe")
grid.arrange(show_nveg, show_nfruit, show_nmeat, ncol = 2, nrow = 2)
As we can observe, most of the recipes have 1 or 2 vegetables and rarely
more than 4. The same is true for fruits. Concerning the meat, it is usual
to see one kind of meat, but rare to see 3 or more in the same
recipe.
4.3 Mixed EDA - ingredients and nutritional value
# Assemble the analysis table: core recipe columns joined (by recipe ID) with
# the engineered total_* counts, plus a companion table of the *_bin dummies.
recipes_select <- recipes %>%
  select(ID, title, rating, calories, protein, sodium, fat)
ingredients_select <- ingredients_df_total %>%
  select(ID, all_of(contains("total")))
recipes_more <- recipes_select %>%
  left_join(ingredients_select, by = "ID")
ingredients_bin_select <- ingredients_df_bin %>%
  select(ID, contains("bin"))
recipes_full <- recipes_more %>%
left_join(ingredients_bin_select,
by=c('ID'))4.3.1 Correlation between nutritional values and engineered ingredients variables
# Correlation heatmap of rating, nutritional values and the engineered
# total_*/*_bin variables (ID dropped: it carries no information).
recipes_full %>%
select(-ID) %>%
plot_correlation()
For instance we notice a positive correlation between proteins and
meats_bin which includes all sorts of meat. Another visible positive
correlation is the one between sodium and seafood_bin. We might also
want to investigate the relationship between calories and carbs_bin.
4.3.2 Barplot and boxplot - Meat and Proteins
# Mean protein per recipe, split by presence/absence of meat.
meat_average01 <- recipes_full %>%
  group_by(meats_bin) %>%
  summarise(avg_protein = mean(protein))
# Barplot of the same comparison (bar height = group mean)
barplot1 <- recipes_full %>%
  ggplot(aes(x = factor(meats_bin), y = protein)) +
  stat_summary(geom = "bar", fun = mean) +
  labs(
    title = "Average amount of proteins per recipe\nwith and without meat",
    x = "Presence of Meat or not",
    y = "Protein Content in grams"
  ) +
  theme(plot.title = element_text(size = 9))
# Boxplots per different kinds of meat
# Compare protein content across individual meat types.
# Map each kept recipe ID back to the full set of columns in `recipes`.
recipes_general <- recipes_full %>%
  select(ID) %>%
  left_join(recipes, by = "ID")
recipes_meat <- recipes_general %>%
  select(ID, title, rating, calories, protein, fat, sodium, all_of(all_meats))
# Pivot the meat dummies to long format. Reuse `all_meats` instead of a
# hard-coded copy of the 33 column names, so the pivot cannot drift out of
# sync with the select() above.
recipes_meat <- recipes_meat %>%
  pivot_longer(cols = all_of(all_meats),
               names_to = 'meats',
               values_to = 'yes_or_no') %>%
  filter(yes_or_no == 1)
multi_boxplot1 <- recipes_meat %>%
  filter(protein < 450) %>%  # drop extreme protein outliers
  ggplot(aes(x = meats, y = protein, fill = meats)) +
  geom_boxplot(alpha = 0.3) +
  theme(legend.position = "none",
        plot.title = element_text(size = 9)) +
  scale_y_continuous(breaks = seq(0, 7000, 25)) +
  coord_flip() +
  ggtitle("Distribution of proteins per recipe according\nto different kinds of meat") +
  xlab("Meats") +
  ylab("Protein Content in grams")
# Here we want to show which kinds of meat specifically have a high level of proteins
grid.arrange(barplot1, multi_boxplot1, ncol = 2, nrow = 1)
It is very clear that among the recipes with meat, the average content
of protein is higher than in recipes without meat. Among those that have
meat we notice that goose, venison and lamb shank have the highest
content of protein.
4.3.3 Barplot and boxplot - Seafood and Sodium
# Mean sodium with/without seafood, then sodium distribution per seafood type.
seafood_average01 <- recipes_full %>%
  group_by(seafood_bin) %>%
  summarise(avg_seafood = mean(sodium))
# Seafood and sodium: bar height = group mean
barplot2 <- recipes_full %>%
  ggplot(aes(x = factor(seafood_bin), y = sodium)) +
  stat_summary(fun = mean, geom = "bar") +
  ggtitle("Average amount of sodium per recipe\nwith and without seafood") +
  xlab("Presence of Seafood or not") +
  ylab("Sodium Content in milligrams") +
  theme(plot.title = element_text(size = 9))
# Boxplots per different kinds of seafood
recipes_seafood <- recipes_general %>%
  select(ID, title, rating, calories, protein, fat, sodium, all_of(seafood_vec))
# Reuse `seafood_vec` instead of a hard-coded duplicate of the column names,
# keeping the pivot consistent with the select() above.
recipes_seafood <- recipes_seafood %>%
  pivot_longer(cols = all_of(seafood_vec),
               names_to = 'seafoods',
               values_to = 'yes_or_no') %>%
  filter(yes_or_no == 1)
multi_boxplot2 <- recipes_seafood %>%
  filter(sodium < 10000) %>%  # drop extreme sodium outliers
  ggplot(aes(x = seafoods, y = sodium, fill = seafoods)) +
  geom_boxplot(alpha = 0.3) +
  theme(legend.position = "none",
        plot.title = element_text(size = 9)) +
  scale_y_continuous(breaks = seq(0, 30000, 500)) +
  coord_flip() +
  ggtitle("Distribution of sodium per recipe according\nto different kinds of seafood") +
  xlab("Seafood") +
  ylab("Sodium Content in milligrams")
# Here we want to show which kinds of seafood specifically have a high level of sodium
grid.arrange(barplot2, multi_boxplot2, ncol = 2, nrow = 1)
In this case it is very clear that among the recipes with seafood, the
average content of sodium is higher than in recipes without seafood.
Among those that have seafood we notice that clams and lobsters have the
highest content of sodium.
4.3.4 Barplot and boxplot - Carbs and Calories
# Mean calories with/without carbohydrates, then calories per carb type.
carbs_average01 <- recipes_full %>%
  group_by(carbs_bin) %>%
  summarise(avg_carbs = mean(calories))
# Carbs and calories
barplot3 <- recipes_full %>%
  ggplot(aes(x = factor(carbs_bin), y = calories)) +
  stat_summary(fun = mean, geom = "bar") +
  ggtitle("Average amount of calories per recipe\nwith and without carbohydrates") +
  xlab("Presence of carbohydrates or not") +
  ylab("Calories content") +
  theme(plot.title = element_text(size = 9))
# Boxplots per different kinds of carbs
recipes_carbs <- recipes_general %>%
  select(ID, title, rating, calories, protein, fat, sodium, all_of(carbs_vec))
# Reuse `carbs_vec` instead of a hard-coded duplicate of the column names.
recipes_carbs <- recipes_carbs %>%
  pivot_longer(cols = all_of(carbs_vec),
               names_to = 'carbs',
               values_to = 'yes_or_no') %>%
  filter(yes_or_no == 1)
multi_boxplot3 <- recipes_carbs %>%
  # filter on calories (the plotted variable); the original filtered
  # sodium < 10000, an apparent copy-paste from the seafood section
  filter(calories < 10000) %>%
  ggplot(aes(x = carbs, y = calories, fill = carbs)) +
  geom_boxplot(alpha = 0.3) +
  theme(legend.position = "none",
        plot.title = element_text(size = 9)) +
  scale_y_continuous(breaks = seq(0, 7000, 500)) +
  coord_flip() +
  ggtitle("Distribution of calories per recipe according\nto different kinds of food high in carbohydrates") +
  xlab("Carbs") +
  ylab("Calories Content")
grid.arrange(barplot3, multi_boxplot3, ncol = 2, nrow = 1)
Recipes which contain carbs register a higher average content of
calories than recipes without carbs. Among those that have carbs we
notice that pasta and chickpeas are the richest in calories.
4.4 Seasons, Recipe Type, and Countries EDA
4.4.1 Seasons
#Create seasons df
seasons_df <- recipes %>%
select(ID, rating, all_of(seasons_vec)) %>%
filter(if_any(all_of(seasons_vec)) == 1) %>%
mutate(sum_season = rowSums(across(all_of(seasons_vec))), rating_bin = ifelse(rating>4, 1, 0)) %>%
relocate(rating_bin, .after = rating)
# Bar chart: how many season tags each recipe carries (1-4).
seasons_df %>%
ggplot(aes(x=sum_season)) +
geom_bar() +
labs(x = "Number of Seasons", y = "Count", title = "Number of different seasons per recipe")# seasons_df %>%
# filter(sum_season==3)
#
# seasons_df %>%
# filter(sum_season==4)
#total of 29 recipes with either 3 or 4 --> let's discard them
#let's look a bit more closely to those with 2 seasons to see if they are next to each other or not
# seasons_df %>%
# filter(sum_season==2)
### should we keep those observations??? no
#removing obs with more than one season
# Keep only recipes tagged with exactly one season.
seasons_df <- seasons_df %>%
filter(sum_season==1)As we can see, there is again no correlation between seasons and recipe ratings, be it the 7-level rating or the binary rating.
# Correlation of the season dummies with rating / rating_bin
# (helper columns sum_season and ID dropped first).
seasons_df %>%
select(-sum_season, -ID) %>%
plot_correlation(title = "Corrplot of seasons with rating")4.4.2 Recipe Type
#taking into account only the "main" recipe types
recipe_types_select <- c("breakfast", "brunch", "dessert", "dinner", "lunch")
type_df <- recipes %>%
  select(ID, rating, all_of(recipe_types_select)) %>%
  # keep recipes flagged with at least one main type; the predicate belongs
  # inside if_any() (the original placed the == 1 outside the call)
  filter(if_any(all_of(recipe_types_select), ~ .x == 1)) %>%
  mutate(sum_type = rowSums(across(all_of(recipe_types_select))),
         rating_bin = ifelse(rating > 4, 1, 0)) %>%
  relocate(rating_bin, .after = rating)
# Bar chart: how many of the five main type tags each recipe carries.
type_df %>%
ggplot(aes(x=sum_type)) +
geom_bar()+
labs(x = "Number of recipe types", y = "Count", title = "Number of different recipe types variables per recipe")# type_df %>%
# filter(sum_type == 2)
# left with 3333 obs after filtering
# Keep only recipes with at most one type tag (!(sum_type > 1)).
type_df <- type_df %>%
filter(!sum_type >1)Once again very low correlation for recipe types relative to either rating variables, we will not included them in the analysis.
type_df %>%
select(-c(ID, sum_type)) %>%
plot_correlation(title = "Corrplot of recipe types relative to rating")4.4.3 Countries
Only 54 recipes contain info about the country, so we cannot use this variable for the analysis either.
# Recipes tagged with at least one country dummy, plus a count of country
# tags per recipe. The predicate belongs inside if_any(); the original
# compared the if_any() result to 1 outside the call.
countries_df <- recipes %>%
  select(ID, rating, all_of(countries)) %>%
  filter(if_any(all_of(countries), ~ .x == 1)) %>%
  mutate(sum_type = rowSums(across(all_of(countries))))
# Bar chart: number of country tags per recipe.
countries_df %>%
ggplot(aes(x=sum_type)) +
geom_bar()+
labs(x = "Number of Countries per recipe", y = "Count", title = "Number of different countries variables per recipe")5 Unsupervised Learning EDA
5.1 Exploratory PCA Analysis - ALL 680 variables
- With all variables 680
# PCA on all variables except the identifiers.
# NOTE(review): ncp = 679 keeps every component for ~10k rows, which is
# memory- and time-intensive -- confirm all components are actually needed.
recipes_pca <- recipes %>%
select(-ID, -title) %>%
PCA(ncp = 679, graph = FALSE)
recipes_pca#> **Results for the Principal Component Analysis (PCA)**
#> The analysis was performed on 10163 individuals, described by 674 variables
#> *The results are available in the following objects:
#>
#> name description
#> 1 "$eig" "eigenvalues"
#> 2 "$var" "results for the variables"
#> 3 "$var$coord" "coord. for the variables"
#> 4 "$var$cor" "correlations variables - dimensions"
#> 5 "$var$cos2" "cos2 for the variables"
#> 6 "$var$contrib" "contributions of the variables"
#> 7 "$ind" "results for the individuals"
#> 8 "$ind$coord" "coord. for the individuals"
#> 9 "$ind$cos2" "cos2 for the individuals"
#> 10 "$ind$contrib" "contributions of the individuals"
#> 11 "$call" "summary statistics"
#> 12 "$call$centre" "mean of the variables"
#> 13 "$call$ecart.type" "standard error of the variables"
#> 14 "$call$row.w" "weights for the individuals"
#> 15 "$call$col.w" "weights for the variables"
fviz_pca_var(recipes_pca)
Hard to interpret this PCA output. The two dimensions explain together
only 2.1% of the variability in the data.
fviz_contrib(recipes_pca, choice = "var", axes = 1)fviz_contrib(recipes_pca, choice = "var", axes = 2)
Also in this case it is difficult to understand which variables are contributing to
each dimension, since each dimension itself accounts for only a small
percentage of the variability.
fviz_pca_biplot(recipes_pca) ## biplotfviz_eig(recipes_pca, addlabels = TRUE, ncp=11)5.2 Exploratory PCA Analysis - Selected
- All the variables unless “useless” ones such as us_cities or chef
# PCA on the reduced ingredient data set (identifiers removed); keep all 294
# components and show the scree plot for the first 11.
ingredients_df_pca <- ingredients_df %>%
select(-title, -ID)
recipes_pca2 <- PCA(ingredients_df_pca, ncp = 294, graph = FALSE)
fviz_eig(recipes_pca2, addlabels = TRUE, ncp=11)
The results do not substantially differ from the analysis with 680
variables.
5.3 Exploratory PCA Analysis - Nut + Total + Bin
- Only with nutritional variables, total and bin variables
# Build the PCA input: rating + nutritional values + engineered bin/total columns.
nutritional_df <- recipes %>%
select(ID, all_of(nutritional_values))
recipes_analysis <- ingredients_df_full %>%
left_join(nutritional_df, by="ID") %>%
# NOTE(review): this across() has no .fns argument, so it is a no-op; the
# analogous chunk later applies as.factor. Converting to factors here would
# break the numeric PCA below, so confirm which version was intended.
mutate(across(all_of(contains("bin"))), ID = as.character(ID)) %>%
select(rating, all_of(nutritional_values), contains("bin"), contains("total"))recipes_pca3 <- PCA(recipes_analysis, ncp = 33, graph = FALSE)
recipes_pca3#> **Results for the Principal Component Analysis (PCA)**
#> The analysis was performed on 10163 individuals, described by 33 variables
#> *The results are available in the following objects:
#>
#> name description
#> 1 "$eig" "eigenvalues"
#> 2 "$var" "results for the variables"
#> 3 "$var$coord" "coord. for the variables"
#> 4 "$var$cor" "correlations variables - dimensions"
#> 5 "$var$cos2" "cos2 for the variables"
#> 6 "$var$contrib" "contributions of the variables"
#> 7 "$ind" "results for the individuals"
#> 8 "$ind$coord" "coord. for the individuals"
#> 9 "$ind$cos2" "cos2 for the individuals"
#> 10 "$ind$contrib" "contributions of the individuals"
#> 11 "$call" "summary statistics"
#> 12 "$call$centre" "mean of the variables"
#> 13 "$call$ecart.type" "standard error of the variables"
#> 14 "$call$row.w" "weights for the individuals"
#> 15 "$call$col.w" "weights for the variables"
fviz_pca_var(recipes_pca3)
Here we can observe that calories and fat are positively correlated with
the first dimension, whereas fruits_bin and total_fruits are negatively
correlated with the second dimension.
fviz_contrib(recipes_pca3, choice = "var", axes = 1)fviz_contrib(recipes_pca3, choice = "var", axes = 2)We notice that nutritional values, meats_bin, vegetables_bin, total_dessert, total_meat and a few other engineered variables make a significant contribution to the first dimension.
fviz_pca_biplot(recipes_pca3) ## biplot
Here we see that most points are distributed on the positive side of
dimension 1 and equally spread on dimension 2. We notice extreme values
on the top right, where dimension 1 and dimension 2 are highly
positive.
fviz_eig(recipes_pca3, addlabels = TRUE, ncp=11)
Here the first 10 dimensions account for 70.9% of total explained
variance.
p1 <- fviz_pca_biplot(recipes_pca3, axes = 1:2)
p2 <- fviz_pca_biplot(recipes_pca3, axes = 3:4)
p3 <- fviz_pca_biplot(recipes_pca3, axes = 5:6)
p4 <- fviz_pca_biplot(recipes_pca3, axes = 7:8)
p5 <- fviz_pca_biplot(recipes_pca3, axes = 9:10)
grid.arrange(p1, p2, p3, p4, p5, nrow = 3, ncol=2)
This is an overview of the 10 selected dimensions. However, it is hard
to detect clear patterns.
5.4 Clustering - ALL
Let us try to apply the clustering techniques to the data set with all 680 variables. First we have to scale all the numerical features except the variable rating.
# Clustering input: drop identifiers and keep only the first half of the rows
# (full data was too large for hclust, per the note below). All columns except
# position 294 are scaled -- per the text above, that column is the unscaled
# rating.
recipes_clust <- ingredients_df %>%
select(-ID, -title) %>%
filter(row_number() <= n() / 2)
recipes_clust[,-294] <- scale(recipes_clust[,-294])Due to high number of values in our dataset, R was unable to run the hierarchical clustering. Hence, we had to consider only half of the dataset to obtain results in a reasonable amount of time. We acknowledge that this could have a negative impact on the quality of the results.
5.4.1 Hierarchical Clustering
We apply agglomerative hierarchical clustering
# Pairwise Manhattan distances between recipes (rating column 294 excluded),
# then reshaped to long format for inspection.
recipes_d <- dist(recipes_clust[,-294], method = "manhattan") # matrix of Manhattan distances
recipes_melt <- melt(as.matrix(recipes_d)) # create a data frame of the distances in long format
head(recipes_melt)#> Var1 Var2 value
#> 1 1 1 0.00
#> 2 2 1 80.81
#> 3 3 1 61.89
#> 4 4 1 80.82
#> 5 5 1 89.51
#> 6 6 1 96.18
It is impossible to extract any information from such a graph.
5.4.2 Dendrogram
We build a dendrogram using the complete linkage
# Agglomerative clustering with complete linkage. The dendrogram is plotted
# twice because rect.hclust() draws the cluster rectangles on top of the
# second plot; cutree() then extracts the 8-cluster assignment.
recipes_hc <- hclust(recipes_d, method = "complete")
plot(recipes_hc, hang=-1)
plot(recipes_hc, hang=-1)
rect.hclust(recipes_hc, k=8) # we cut the tree to 8 clustersrecipes_clust_cut <- cutree(recipes_hc, k=8)We cut the tree to 8 clusters, and represent the result. We also extract the cluster assignment of each recipe.
5.4.3 Interpretation of the clusters
Here we analyze the clusters by looking at the distribution of the features within each cluster
# Attach the cluster labels to the scaled features and melt to long format so
# each cluster's per-variable distribution can be faceted below.
recipes_comp <- data.frame(recipes_clust[,-294], Clust=factor(recipes_clust_cut), Id=row.names(recipes_clust))
recipes_clust_df <- melt(recipes_comp, id=c("Id", "Clust"))
head(recipes_clust_df)#> Id Clust variable value
#> 1 1 1 artichoke -0.08038
#> 2 2 1 artichoke -0.08038
#> 3 3 1 artichoke -0.08038
#> 4 4 1 artichoke -0.08038
#> 5 5 1 artichoke -0.08038
#> 6 6 1 artichoke -0.08038
# One boxplot panel per (scaled) feature, grouped and coloured by cluster.
ggplot(recipes_clust_df, aes(y = value, fill = Clust, group = Clust)) +
  facet_wrap(~variable) +
  geom_boxplot()
Again very hard to interpret such results. There are some clusters that
show high presence of certain variables. For instance the 5th cluster
indicates higher values of tapioca, granola, oat and pie compares to all
the rest. The 7th cluster also shows higher values for meatball and
turkey with respect to all the other variables.
5.4.4 Choice of the number of clusters
To choose the number of clusters we can inspect the dendrogram or rely on statistics such as sum-of-squares, the GAP statistics, and the silhouette. In our case we choose the silhouette method.
# Choose k by average silhouette width for hierarchical clustering
# (complete linkage, Manhattan metric), scanning k = 1..25.
fviz_nbclust(recipes_clust[,-294],
hcut, hc_method="complete",
hc_metric="manhattan",
method = "silhouette",
k.max = 25, verbose = FALSE)
These methods are not easy to interpret. The silhouette method chooses 2
clusters while the sum-of-squares and gap method did not provide any
clear information in this regard.
5.5 Clustering - NuTo
Let us try to apply unsupervised learning to the data set with only numerical variables. We want to avoid making the clustering on binary variables. As done before, we have to scale all the numerical features except the variable rating.
# Numeric-only clustering data: rating + nutritional values + total_* counts,
# first half of the rows; everything except column 1 (rating) is scaled.
recipes_clust2 <- ingredients_df_full %>%
left_join(nutritional_df, by="ID") %>%
mutate(across(all_of(contains("bin")), as.factor) , ID = as.character(ID)) %>%
# NOTE(review): this select() mixes positive (rating, ...) and negative
# (-ID, -title) selections in one call, and the bin factors created above are
# not selected -- confirm the mutate() is needed at all.
select(rating, -ID, -title, all_of(nutritional_values), contains("total")) %>%
filter(row_number() <= n() / 2)
recipes_clust2[,-1] <- scale(recipes_clust2[,-1])5.5.1 Hierarchical Clustering
We apply agglomerative hierarchical clustering
# Pairwise Manhattan distances on the numeric-only data (rating column 1
# excluded), reshaped to long format for inspection.
recipes_d2 <- dist(recipes_clust2[,-1], method = "manhattan") # matrix of Manhattan distances
recipes_melt2 <- melt(as.matrix(recipes_d2)) # create a data frame of the distances in long format
head(recipes_melt2)#> Var1 Var2 value
#> 1 1 1 0.000
#> 2 2 1 8.507
#> 3 3 1 12.145
#> 4 4 1 11.999
#> 5 5 1 11.465
#> 6 6 1 20.061
It is impossible to extract any information from such a graph.
5.5.2 Dendrogram
We build a dendrogram using the complete linkage
# Complete-linkage clustering on the numeric-only distances; the dendrogram is
# plotted twice so rect.hclust() can overlay the 8-cluster rectangles on the
# second plot, and cutree() extracts the assignment.
recipes_hc2 <- hclust(recipes_d2, method = "complete")
plot(recipes_hc2, hang=-1)
plot(recipes_hc2, hang=-1)
rect.hclust(recipes_hc2, k=8) # we cut the tree to 8 clustersrecipes_clust_cut2 <- cutree(recipes_hc2, k=8)We cut the tree to 8 clusters, and represent the result. We also extract the cluster assignment of each recipe.
5.5.3 Interpretation of the clusters
Here we analyze the clusters by looking at the distribution of the features within each cluster
# Attach the 8-cluster labels to the scaled numeric features and melt to long
# format for the faceted boxplots below.
recipes_comp2 <- data.frame(recipes_clust2[,-1], Clust=factor(recipes_clust_cut2), Id=row.names(recipes_clust2))
recipes_clust_df2 <- melt(recipes_comp2, id=c("Id", "Clust"))
head(recipes_clust_df2)#> Id Clust variable value
#> 1 1 1 calories -0.1216
#> 2 2 1 calories -0.1647
#> 3 3 1 calories 0.8561
#> 4 4 1 calories -0.6011
#> 5 5 1 calories 0.2080
#> 6 6 1 calories -0.4401
# Per-variable boxplots of the scaled numeric features, coloured by cluster.
ggplot(recipes_clust_df2, aes(y = value, fill = Clust, group = Clust)) +
  facet_wrap(~variable) +
  geom_boxplot()
Again very hard to interpret such results. There are some clusters that
show high presence of certain variables. For instance the 2nd cluster
indicates higher values of total_nuts, total_fruits, total_dairy and
total_dessert, hence this cluster should represent dessert recipes. The
8th cluster contains high values of protein, sodium, total_seafood,
total_spices and total_carbs. This might be associated with recipes
containing seafood which are high in sodium and protein.
5.5.4 Choice of the number of clusters
To choose the number of clusters we can inspect the dendrogram or rely on statistics such as sum-of-squares, the GAP statistics, and the silhouette.
# Choose k for hierarchical clustering on the numeric data: elbow (wss) scan
# followed by a silhouette scan, both over k = 1..25.
fviz_nbclust(recipes_clust2[,-1],
hcut, hc_method="complete",
hc_metric="manhattan",
method = "wss",
k.max = 25, verbose = FALSE)fviz_nbclust(recipes_clust2[,-1],
hcut, hc_method="complete",
hc_metric="manhattan",
method = "silhouette",
k.max = 25, verbose = FALSE)
These methods are not easy to interpret. The silhouette method chooses 2
clusters while the sum-of-squares method did not provide any clear
information in this regard. There seems to be an elbow at the 5th, 10th
and 16th cluster.
5.5.5 Dendrogram
We build a dendrogram using the complete linkage
# Re-cut the same tree at k = 10 and overlay the cluster rectangles.
plot(recipes_hc2, hang=-1)
rect.hclust(recipes_hc2, k=10) # we cut the tree to 10 clustersrecipes_clust_cut3 <- cutree(recipes_hc2, k=10)Now we want to focus on 10 clusters.
5.5.6 Interpretation of the clusters
Here we analyze the clusters by looking at the distribution of the features within each cluster
# Attach the 10-cluster labels to the scaled numeric features and melt to long
# format for the faceted boxplots below.
recipes_comp3 <- data.frame(recipes_clust2[,-1], Clust=factor(recipes_clust_cut3), Id=row.names(recipes_clust2))
recipes_clust_df3 <- melt(recipes_comp3, id=c("Id", "Clust"))
head(recipes_clust_df3)#> Id Clust variable value
#> 1 1 1 calories -0.1216
#> 2 2 1 calories -0.1647
#> 3 3 2 calories 0.8561
#> 4 4 1 calories -0.6011
#> 5 5 2 calories 0.2080
#> 6 6 1 calories -0.4401
# Per-variable boxplots for the 10-cluster solution.
ggplot(recipes_clust_df3, aes(y=value, group=Clust, fill=Clust)) +
geom_boxplot() +
facet_wrap(~variable) +
ggtitle("Interpretation of 10 clusters according to 19 variables")5.6 K-Means NuTo
# K-means on the numeric data: elbow (wss) and silhouette scans, then a final
# fit with 10 centers.
# NOTE(review): kmeans() uses random starts and no seed is set, so the cluster
# assignments are not reproducible across runs -- consider set.seed().
fviz_nbclust(recipes_clust2[,-1],
kmeans,
method = "wss",
k.max = 25, verbose = FALSE)fviz_nbclust(recipes_clust2[,-1],
kmeans,
method = "silhouette",
k.max = 25, verbose = FALSE)recipes_km2 <- kmeans(recipes_clust2[,-1], centers=10)
recipes_km2$cluster#> [1] 9 6 3 6 3 8 10 6 6 9 10 3 9 9 6 6 9 7 6 6 6
#> [22] 7 9 6 9 9 7 6 6 3 6 3 6 7 6 2 6 8 9 6 2 10
#> [43] 10 2 7 6 8 10 9 3 10 3 3 3 7 3 8 3 6 2 9 7 5
#> [64] 9 10 6 3 7 2 6 2 7 7 2 7 6 9 6 9 2 6 6 8 8
#> [85] 6 5 6 6 9 3 5 6 6 2 9 5 7 10 6 8 7 6 6 6 5
#> [106] 6 6 2 2 8 10 3 8 10 6 3 3 8 7 7 8 3 3 3 2 10
#> [127] 6 10 2 2 6 2 9 2 8 2 9 10 6 3 6 3 5 7 8 2 2
#> [148] 4 7 9 6 7 8 6 10 7 6 3 9 8 3 1 6 10 2 7 8 3
#> [169] 6 3 6 3 9 3 9 6 8 8 3 6 2 6 9 8 6 3 7 10 1
#> [190] 6 7 6 10 8 7 3 3 7 6 6 8 8 3 6 6 2 5 2 3 9
#> [211] 6 3 6 6 3 8 8 10 9 3 6 6 8 2 10 8 9 9 7 9 6
#> [232] 2 6 6 6 3 9 7 10 6 2 6 10 6 8 8 3 9 9 3 7 8
#> [253] 3 8 6 3 3 8 8 2 6 6 10 5 7 2 2 7 3 6 5 9 7
#> [274] 8 3 10 6 3 6 2 2 7 3 2 2 6 3 5 6 9 5 6 6 8
#> [295] 3 3 5 6 10 6 3 8 6 6 10 2 3 2 8 8 3 6 6 6 3
#> [316] 2 8 5 6 6 6 2 3 8 7 9 10 2 7 8 8 6 6 6 9 8
#> [337] 2 8 2 8 10 6 8 2 8 9 4 8 9 7 10 2 6 8 7 6 2
#> [358] 3 6 6 3 10 3 3 6 3 7 6 6 3 6 10 3 3 6 6 6 2
#> [379] 3 7 7 10 10 10 8 9 6 6 3 7 6 9 3 6 3 6 2 10 2
#> [400] 6 2 3 3 7 3 6 3 8 7 6 6 9 3 9 8 6 7 9 2 6
#> [421] 9 7 8 4 3 6 6 3 6 9 10 3 7 5 8 6 2 6 9 2 3
#> [442] 6 9 8 7 6 2 3 8 6 3 6 3 6 6 9 6 3 6 2 6 6
#> [463] 3 6 8 7 10 3 6 9 7 5 8 3 1 10 3 6 7 2 8 7 10
#> [484] 3 6 6 6 2 10 6 3 6 6 6 6 7 6 8 8 3 8 8 6 7
#> [505] 7 6 7 6 9 6 6 9 2 6 9 3 2 3 8 8 3 6 7 8 6
#> [526] 1 10 3 6 6 6 6 6 3 10 7 6 5 6 6 7 6 6 7 5 5
#> [547] 3 8 5 2 6 6 3 3 8 5 8 10 7 8 6 8 6 6 9 10 6
#> [568] 9 6 6 9 10 9 10 3 8 2 3 6 6 3 8 3 6 3 8 5 6
#> [589] 6 6 7 9 5 6 9 9 7 1 10 6 6 7 3 6 6 6 10 6 9
#> [610] 9 2 9 7 8 8 8 3 3 7 5 7 2 6 2 9 6 3 2 10 9
#> [631] 8 3 8 9 5 6 2 10 6 8 6 7 6 6 6 6 7 3 3 6 3
#> [652] 6 1 7 3 2 6 7 6 6 9 5 7 3 7 5 7 7 3 6 7 3
#> [673] 2 7 2 5 10 8 8 2 3 6 6 10 6 6 9 8 3 8 2 6 3
#> [694] 5 2 10 10 8 8 8 3 10 6 7 7 3 10 6 2 7 7 6 8 2
#> [715] 9 10 6 2 2 6 7 6 7 7 10 10 3 3 3 9 8 6 6 6 6
#> [736] 7 9 3 6 6 10 9 2 2 3 6 7 9 6 6 10 6 3 6 6 8
#> [757] 8 1 7 7 7 9 3 7 9 6 5 6 3 2 6 3 8 2 7 7 6
#> [778] 9 6 9 3 2 2 10 9 5 2 6 10 1 9 3 9 3 9 9 5 9
#> [799] 9 3 6 4 9 2 5 5 3 3 3 6 8 6 6 2 2 9 5 3 6
#> [820] 6 10 7 7 6 2 2 6 10 2 10 8 2 7 10 6 8 6 7 10 8
#> [841] 2 8 10 8 3 6 6 8 6 8 2 6 8 8 8 10 6 7 6 9 10
#> [862] 3 6 9 2 3 10 3 6 7 8 2 3 9 10 3 2 6 9 6 9 2
#> [883] 9 6 3 3 6 7 6 8 5 5 9 5 5 2 3 6 9 7 9 5 8
#> [904] 6 9 3 3 8 8 3 5 6 8 5 6 6 9 10 10 3 7 3 6 6
#> [925] 6 7 3 6 10 5 8 3 8 7 2 6 6 9 2 8 7 6 6 4 3
#> [946] 9 9 5 8 6 2 3 6 10 6 3 6 9 3 3 6 6 8 6 7 3
#> [967] 8 7 3 6 10 2 9 8 8 7 6 6 9 3 3 3 10 9 6 6 10
#> [988] 7 6 2 3 6 2 5 6 8 6 5 8 10 6 6 6 3 2 9 10 2
#> [1009] 6 6 6 7 8 3 3 2 7 3 9 6 6 8 9 9 9 3 10 3 3
#> [1030] 2 3 8 10 9 5 2 6 6 7 2 3 8 2 10 6 8 2 9 6 3
#> [1051] 10 6 8 8 6 8 8 7 9 7 7 5 9 8 7 6 10 6 3 7 6
#> [1072] 6 10 3 2 2 3 7 6 7 6 6 6 6 8 6 2 10 7 8 10 3
#> [1093] 3 6 3 8 6 6 10 6 9 5 2 10 3 6 9 6 6 7 8 5 6
#> [1114] 6 10 3 6 9 10 2 8 6 9 9 2 6 6 1 8 3 8 3 7 10
#> [1135] 9 3 6 2 9 10 2 8 9 6 3 9 6 2 9 9 2 3 6 10 9
#> [1156] 2 6 7 10 7 2 3 9 7 3 6 9 7 8 5 7 8 2 6 7 7
#> [1177] 8 3 6 9 3 3 6 2 9 9 7 9 3 7 9 6 6 2 6 8 8
#> [1198] 8 6 3 8 5 7 2 9 3 6 6 3 6 6 6 6 2 6 3 6 7
#> [1219] 9 3 10 6 3 3 9 3 9 3 9 6 6 6 7 6 2 6 4 9 4
#> [1240] 6 3 3 8 6 6 9 3 5 3 6 10 3 8 5 6 2 3 8 7 10
#> [1261] 6 6 3 10 5 8 7 10 8 6 6 3 6 6 10 8 3 2 2 7 6
#> [1282] 6 6 6 10 6 3 6 10 7 6 7 10 3 3 6 6 8 1 6 8 6
#> [1303] 8 7 6 6 3 6 2 9 6 3 6 3 2 10 8 6 8 3 3 6 9
#> [1324] 9 7 6 9 8 8 3 6 6 6 3 6 6 6 9 2 6 7 2 10 3
#> [1345] 3 8 6 3 6 3 2 3 3 7 8 6 1 3 3 8 3 10 9 7 3
#> [1366] 5 3 3 3 5 2 8 6 6 6 6 6 6 9 6 3 10 8 10 1 6
#> [1387] 7 6 2 2 2 3 6 9 7 8 6 6 10 6 10 3 6 2 8 8 2
#> [1408] 2 3 9 2 10 6 2 8 3 1 2 3 6 5 7 3 6 9 3 9 7
#> [1429] 9 7 9 6 8 8 9 3 9 3 6 6 6 6 3 3 6 6 6 9 6
#> [1450] 9 8 9 9 6 8 5 8 10 7 9 3 9 2 3 8 6 6 6 9 6
#> [1471] 7 2 3 9 6 3 8 7 9 6 9 6 9 3 10 3 8 7 2 1 6
#> [1492] 6 9 7 2 5 3 6 6 2 6 7 8 3 8 3 7 6 2 10 6 6
#> [1513] 8 3 10 10 3 6 6 6 2 2 6 7 2 7 6 5 1 8 6 2 2
#> [1534] 3 6 1 10 2 6 7 6 9 1 6 8 2 6 6 3 6 9 6 6 10
#> [1555] 3 6 10 6 6 2 3 6 5 3 9 8 6 7 5 8 6 5 8 7 6
#> [1576] 3 8 3 3 3 6 6 9 3 7 6 2 6 9 4 10 6 5 3 5 7
#> [1597] 6 5 3 6 6 2 2 9 2 2 2 6 8 8 6 6 8 6 9 2 6
#> [1618] 6 3 6 2 9 2 9 2 2 9 6 1 7 9 8 7 3 8 6 3 6
#> [1639] 6 9 6 10 5 3 8 10 3 7 6 9 8 2 10 7 9 2 7 5 10
#> [1660] 6 2 9 6 10 9 8 9 6 6 9 9 8 7 9 6 3 6 6 6 2
#> [1681] 9 7 3 7 2 5 6 3 6 3 5 2 6 7 6 6 3 8 7 6 9
#> [1702] 9 6 2 6 6 9 7 7 7 6 3 2 2 3 1 3 3 7 9 5 6
#> [1723] 5 6 3 10 2 7 8 7 9 3 6 9 2 9 2 7 10 6 6 9 6
#> [1744] 10 2 8 5 3 6 7 1 6 2 9 2 6 7 7 8 3 9 6 9 9
#> [1765] 3 8 6 6 8 1 6 3 6 8 10 3 2 9 2 8 2 8 6 6 2
#> [1786] 6 8 1 6 3 5 6 3 2 3 6 6 9 8 8 3 9 10 6 10 7
#> [1807] 6 6 8 6 7 8 9 9 9 3 3 2 5 5 9 6 6 10 6 3 6
#> [1828] 9 8 2 7 6 3 6 8 9 3 2 3 1 6 7 7 6 8 10 6 6
#> [1849] 8 6 9 6 2 5 6 6 3 2 9 7 6 6 6 6 9 6 8 7 3
#> [1870] 5 9 2 9 9 7 3 2 10 6 6 8 2 3 2 9 9 9 2 3 7
#> [1891] 9 9 6 7 6 2 6 5 6 6 8 3 2 8 3 7 9 7 7 7 3
#> [1912] 3 6 8 8 3 6 7 10 2 5 2 3 8 6 3 1 6 6 10 5 2
#> [1933] 6 6 7 9 10 9 10 5 6 3 6 2 6 3 9 7 3 5 6 8 7
#> [1954] 2 5 8 3 7 7 9 7 8 2 3 3 3 6 7 9 9 6 3 2 6
#> [1975] 2 8 6 6 9 3 10 6 7 7 7 7 8 9 2 5 3 6 8 5 3
#> [1996] 6 3 8 3 6 9 7 2 2 10 6 9 2 6 7 3 8 10 9 10 8
#> [2017] 2 8 3 3 7 9 3 8 6 7 3 5 9 6 6 7 8 6 3 8 6
#> [2038] 9 8 3 8 8 3 9 7 2 6 10 9 9 10 3 6 1 7 3 3 8
#> [2059] 6 6 8 6 10 3 8 6 5 9 10 8 6 7 6 6 5 6 9 2 6
#> [2080] 3 8 3 2 3 6 9 6 9 3 3 8 10 6 9 8 3 3 3 7 6
#> [2101] 6 9 3 6 9 7 6 3 6 6 2 3 3 6 9 4 10 7 6 8 3
#> [2122] 8 6 6 7 3 6 6 3 6 6 8 2 5 7 6 7 6 9 6 6 7
#> [2143] 9 5 6 8 9 5 6 8 6 10 9 9 6 6 8 3 6 6 6 2 3
#> [2164] 2 6 5 6 9 9 7 6 8 10 9 2 10 8 6 9 2 6 9 9 8
#> [2185] 3 6 9 6 7 6 6 3 2 6 3 2 6 7 8 7 9 6 2 3 8
#> [2206] 3 6 10 6 6 5 8 7 10 2 5 2 2 8 9 8 8 8 8 2 6
#> [2227] 3 6 6 3 2 8 5 9 6 5 8 6 7 9 8 2 6 6 6 5 6
#> [2248] 2 3 3 6 3 3 2 10 7 9 8 2 9 7 6 6 10 7 6 9 6
#> [2269] 9 6 7 2 8 9 6 9 3 10 10 3 3 8 6 10 8 3 2 2 8
#> [2290] 8 6 6 3 8 1 7 7 3 6 8 3 3 3 5 8 3 6 6 2 9
#> [2311] 6 3 2 9 3 3 2 2 3 3 10 6 2 3 2 7 3 2 7 2 8
#> [2332] 8 6 3 2 5 2 2 5 10 2 6 10 6 3 3 6 6 7 6 2 6
#> [2353] 9 6 6 9 7 6 6 8 6 9 2 3 5 9 5 7 6 5 6 10 7
#> [2374] 8 9 2 6 6 2 5 7 6 3 6 9 9 2 6 5 6 9 7 8 6
#> [2395] 7 7 8 7 9 10 8 5 5 6 7 10 9 3 6 2 10 8 7 2 6
#> [2416] 6 8 8 6 3 3 6 6 10 10 2 8 8 8 3 9 6 9 3 9 7
#> [2437] 7 8 6 2 6 8 7 2 6 6 10 8 2 3 8 5 3 6 9 10 7
#> [2458] 2 3 5 7 3 3 5 8 10 8 3 6 6 7 6 3 5 10 6 6 3
#> [2479] 2 9 5 5 8 6 2 2 6 3 3 7 9 7 3 6 8 3 2 10 6
#> [2500] 7 3 8 3 7 6 8 3 3 10 6 2 6 1 6 9 2 6 10 10 10
#> [2521] 6 5 2 7 2 6 9 3 9 9 7 3 9 9 2 6 2 9 1 3 6
#> [2542] 6 7 3 7 6 6 8 6 9 5 10 9 10 10 9 2 6 9 7 8 2
#> [2563] 8 9 7 6 6 2 6 6 8 2 2 9 6 6 9 8 6 8 5 7 9
#> [2584] 10 5 6 6 5 3 6 9 9 6 3 6 7 1 6 6 3 6 6 6 3
#> [2605] 9 2 9 8 2 8 6 6 3 2 6 3 9 3 6 8 3 5 6 9 4
#> [2626] 9 2 10 3 6 6 8 2 9 9 3 8 6 6 6 8 6 2 10 9 6
#> [2647] 9 8 10 7 10 6 3 7 10 9 5 8 8 8 2 6 1 2 9 6 8
#> [2668] 9 1 10 8 8 8 9 3 6 9 6 9 2 8 7 3 8 8 3 9 10
#> [2689] 3 6 6 9 8 7 9 3 2 5 2 6 7 6 6 2 7 10 6 9 9
#> [2710] 6 10 2 6 9 6 9 6 3 3 1 5 6 6 3 5 2 6 3 7 6
#> [2731] 5 9 9 2 6 10 6 7 3 6 6 9 10 3 3 5 6 6 6 3 6
#> [2752] 6 8 6 6 9 9 9 6 9 6 10 6 10 6 8 6 8 9 9 6 6
#> [2773] 6 3 6 6 6 2 2 2 2 5 8 6 3 7 10 3 9 7 8 1 6
#> [2794] 2 6 10 9 6 9 2 2 5 5 7 7 7 2 6 6 8 9 6 8 8
#> [2815] 6 6 9 6 9 3 6 6 6 9 3 6 6 7 6 3 6 9 6 1 9
#> [2836] 3 3 2 3 6 6 9 6 6 8 3 9 6 9 8 10 8 6 6 8 6
#> [2857] 3 6 6 9 2 2 2 3 8 5 3 9 3 3 2 6 8 7 3 3 10
#> [2878] 2 5 6 9 6 10 6 6 6 6 10 6 2 6 2 10 2 10 7 1 8
#> [2899] 2 9 3 9 1 6 9 5 3 3 6 9 3 2 2 5 6 6 9 8 3
#> [2920] 3 3 2 6 7 9 8 10 7 3 3 6 3 6 7 3 8 2 3 6 7
#> [2941] 6 6 2 2 9 6 3 9 2 7 7 10 1 7 7 3 5 10 6 3 7
#> [2962] 6 6 8 5 2 2 8 3 3 2 9 8 3 5 6 9 2 9 6 6 6
#> [2983] 6 6 3 6 7 2 2 3 6 10 5 6 7 3 10 8 7 6 6 7 3
#> [3004] 9 7 3 6 2 9 7 9 5 9 8 3 6 2 6 6 5 8 10 2 2
#> [3025] 3 1 4 5 9 10 9 7 3 8 6 1 8 3 9 6 4 3 3 3 2
#> [3046] 7 2 8 6 2 7 3 2 7 3 6 6 6 6 10 6 2 3 7 2 2
#> [3067] 6 6 7 7 9 3 7 8 8 7 10 2 3 5 7 1 8 7 8 8 3
#> [3088] 8 9 6 9 2 7 3 6 7 7 9 9 2 3 3 7 8 9 6 5 9
#> [3109] 5 3 3 6 6 2 9 9 5 6 4 6 1 8 2 3 6 6 2 6 6
#> [3130] 8 6 6 6 9 3 6 7 9 3 6 7 6 6 9 9 6 3 5 6 8
#> [3151] 6 2 6 9 8 3 3 7 3 2 6 6 2 8 5 7 5 6 6 6 10
#> [3172] 3 6 3 6 6 6 8 9 6 3 7 6 6 8 6 6 3 9 3 8 3
#> [3193] 6 2 5 6 6 5 9 5 6 6 9 3 3 2 6 8 9 7 8 3 2
#> [3214] 8 9 7 8 9 2 3 8 10 7 2 5 3 8 6 9 8 3 7 6 8
#> [3235] 10 8 6 2 8 2 9 2 6 2 9 5 6 8 3 2 6 9 8 10 2
#> [3256] 10 6 6 6 9 6 6 3 3 6 10 6 9 9 8 8 6 6 3 6 6
#> [3277] 9 6 3 5 7 6 8 2 5 9 2 6 6 9 3 2 3 9 6 6 8
#> [3298] 6 8 8 1 8 3 9 6 6 9 3 2 3 3 3 7 6 7 9 3 10
#> [3319] 3 2 9 8 6 2 7 6 6 7 9 7 7 1 7 10 3 6 6 6 6
#> [3340] 3 6 5 5 2 9 6 2 9 3 9 2 6 6 10 9 6 10 3 10 8
#> [3361] 6 6 10 1 3 9 8 7 3 6 9 7 7 8 8 6 3 8 8 3 5
#> [3382] 2 6 8 9 7 6 9 6 6 9 6 2 10 6 7 7 8 6 2 2 6
#> [3403] 3 6 10 3 8 8 6 2 5 6 2 9 8 3 6 8 3 8 5 8 2
#> [3424] 2 2 8 7 5 6 8 10 8 1 6 2 9 7 7 6 9 2 7 6 6
#> [3445] 7 7 2 3 9 7 3 3 2 3 3 6 2 8 8 3 6 3 8 3 7
#> [3466] 10 8 6 3 10 9 6 2 2 10 7 2 3 6 6 3 6 2 3 6 3
#> [3487] 10 2 2 7 3 3 2 7 3 8 10 10 9 6 9 9 6 7 6 5 2
#> [3508] 6 6 2 6 8 3 9 9 10 1 6 2 7 3 3 8 8 8 6 6 2
#> [3529] 3 2 9 2 7 6 8 7 8 7 1 6 2 2 3 8 2 3 6 3 6
#> [3550] 6 5 2 6 9 8 6 7 3 9 1 9 6 8 5 7 6 9 10 8 10
#> [3571] 9 6 9 3 2 2 10 5 10 6 7 6 7 2 6 6 7 8 10 2 3
#> [3592] 3 9 8 2 8 3 9 5 2 6 2 6 6 6 7 6 7 3 6 9 3
#> [3613] 6 2 3 8 6 6 3 3 9 6 7 9 7 5 9 9 6 6 6 8 8
#> [3634] 3 9 5 6 9 9 8 4 10 6 6 3 9 5 1 5 2 6 6 6 9
#> [3655] 6 6 7 9 7 7 8 8 3 6 8 9 7 2 9 9 8 9 9 2 3
#> [3676] 7 9 7 7 6 3 6 6 2 6 3 9 3 7 3 3 10 10 10 6 3
#> [3697] 3 6 7 9 10 9 6 7 3 6 3 3 9 3 3 5 6 9 6 5 6
#> [3718] 8 3 6 6 8 9 8 3 2 9 2 3 7 6 8 2 6 2 9 7 3
#> [3739] 7 9 1 5 5 6 6 3 3 6 9 7 2 6 6 6 7 3 2 6 2
#> [3760] 6 3 8 7 9 5 9 2 3 6 8 3 6 6 2 7 8 9 6 3 6
#> [3781] 6 6 9 6 9 5 9 5 10 6 9 6 6 3 6 2 7 6 1 9 6
#> [3802] 6 2 3 3 7 3 2 3 3 6 9 3 7 3 6 3 6 7 6 3 3
#> [3823] 7 4 2 5 3 9 9 6 10 9 3 2 3 8 3 9 6 3 7 2 3
#> [3844] 6 6 6 1 3 9 8 6 2 5 6 3 7 9 7 6 6 6 6 10 6
#> [3865] 7 3 7 3 2 9 10 6 2 2 3 2 2 5 6 6 6 6 8 9 6
#> [3886] 6 1 9 3 7 3 3 9 9 9 5 3 6 8 3 9 2 9 5 5 6
#> [3907] 3 6 6 10 10 3 5 6 6 9 8 2 3 3 3 9 7 9 8 6 3
#> [3928] 6 7 6 9 6 9 9 2 8 9 6 2 9 10 10 6 9 3 5 6 8
#> [3949] 6 9 1 6 7 7 2 9 10 6 6 9 6 10 7 2 10 3 8 3 8
#> [3970] 2 5 2 10 7 3 2 5 6 6 7 9 2 6 3 7 7 6 3 6 2
#> [3991] 6 6 5 2 1 6 9 3 2 7 10 6 6 2 3 9 2 9 5 8 8
#> [4012] 2 3 2 3 6 5 3 5 5 8 9 2 3 2 9 9 9 8 7 8 7
#> [4033] 3 9 9 10 5 7 7 3 3 3 9 6 9 5 6 2 6 3 3 6 7
#> [4054] 10 2 5 5 6 8 2 8 4 6 9 5 2 5 6 5 6 9 9 8 3
#> [4075] 7 4 6 2 6 6 6 7 9 6 2 9 6 6 10 8 6 2 2 6 3
#> [4096] 8 9 2 6 5 2 8 7 10 6 6 6 7 8 7 8 8 6 10 1 8
#> [4117] 9 9 1 8 3 8 6 6 7 2 6 9 7 9 6 3 3 10 6 9 3
#> [4138] 6 9 6 6 2 3 6 2 9 6 2 7 7 9 2 2 8 3 3 10 9
#> [4159] 6 6 2 9 6 2 7 6 10 6 6 6 3 3 9 7 3 7 9 3 6
#> [4180] 7 8 6 3 2 6 6 3 7 8 6 3 10 3 8 10 6 9 7 10 5
#> [4201] 8 7 2 6 3 9 3 7 9 7 3 6 9 3 3 8 2 6 8 7 6
#> [4222] 8 6 9 2 2 3 9 6 2 3 3 10 2 2 3 9 8 9 10 6 8
#> [4243] 8 9 6 6 9 2 3 3 4 3 6 5 2 3 7 5 7 9 9 6 3
#> [4264] 6 3 3 6 8 7 5 3 8 10 8 6 8 10 8 6 7 9 6 6 6
#> [4285] 9 8 2 8 8 8 5 6 8 6 3 6 8 2 3 10 9 9 3 3 6
#> [4306] 2 2 1 8 6 3 8 9 6 6 6 5 6 6 9 6 7 9 5 5 9
#> [4327] 3 6 7 5 6 6 8 8 2 8 7 6 2 8 9 1 9 3 6 5 6
#> [4348] 6 10 10 5 7 2 7 5 3 8 8 9 3 9 7 6 3 3 2 8 6
#> [4369] 8 7 6 10 3 6 9 8 8 10 6 2 8 9 7 6 2 3 2 6 10
#> [4390] 8 7 8 6 2 9 8 6 6 10 9 7 6 6 6 8 3 9 7 3 3
#> [4411] 8 8 8 9 6 3 8 6 3 5 3 5 7 3 10 7 2 7 7 6 2
#> [4432] 8 2 2 10 6 2 9 6 9 6 8 8 8 2 9 3 7 5 3 6 3
#> [4453] 3 10 9 7 3 6 7 6 10 2 8 6 9 6 6 5 6 6 5 5 5
#> [4474] 1 6 3 10 7 2 2 6 10 9 8 7 9 7 7 8 6 8 9 9 3
#> [4495] 6 6 3 8 3 2 2 6 6 3 4 3 2 9 6 10 2 10 9 8 7
#> [4516] 6 3 9 8 3 9 6 6 6 3 8 3 9 3 7 9 3 3 3 8 10
#> [4537] 3 6 6 3 2 5 2 7 8 8 2 2 9 8 9 9 9 1 6 6 2
#> [4558] 6 3 10 3 8 6 3 6 3 5 6 8 9 9 3 3 5 6 3 8 8
#> [4579] 10 5 6 6 5 9 6 6 8 2 5 6 9 6 2 8 3 2 6 2 6
#> [4600] 6 6 6 9 3 6 7 10 6 3 9 10 10 3 6 6 1 9 7 6 6
#> [4621] 6 6 9 2 9 9 6 8 6 8 4 6 7 6 9 3 7 7 3 3 7
#> [4642] 3 6 3 3 6 2 6 9 7 5 6 9 2 3 7 3 6 5 6 6 6
#> [4663] 6 6 6 8 5 9 6 3 9 6 6 6 6 8 6 2 8 5 3 3 5
#> [4684] 9 8 6 9 2 6 3 9 3 6 6 6 3 3 6 3 7 6 6 6 5
#> [4705] 2 7 6 5 6 8 9 6 9 6 2 8 8 8 10 9 6 7 10 7 1
#> [4726] 9 6 10 9 8 8 5 5 3 3 3 9 10 3 5 6 6 3 10 2 9
#> [4747] 6 7 5 10 2 3 6 3 2 6 2 8 7 10 6 8 5 7 9 6 6
#> [4768] 9 5 2 9 9 3 1 8 2 6 3 7 7 9 7 6 10 6 3 2 10
#> [4789] 6 6 6 6 5 1 1 8 6 9 5 2 6 8 8 2 1 6 6 9 9
#> [4810] 7 3 8 8 3 9 2 6 2 9 8 2 2 7 6 8 2 5 9 6 7
#> [4831] 2 3 3 3 6 8 9 10 10 3 6 6 6 3 4 3 5 5 9 2 2
#> [4852] 2 6 3 2 9 2 7 2 2 8 5 6 6 6 6 5 2 7 6 8 10
#> [4873] 3 5 2 3 1 5 3 9 6 6 6 9 8 5 6 8 5 6 2 7 3
#> [4894] 2 6 7 6 2 7 3 10 9 6 6 6 3 10 7 7 8 6 7 2 3
#> [4915] 3 1 6 6 3 2 2 9 3 3 6 4 6 8 7 3 10 8 3 6 5
#> [4936] 6 8 9 2 10 6 2 7 2 2 6 9 3 6 6 4 6 3 6 3 3
#> [4957] 8 7 10 6 3 9 7 3 3 2 2 3 7 6 3 3 3 6 8 2 2
#> [4978] 3 8 3 8 9 3 6 6 3 8 3 6 5 6 2 10 10 2 9 5 3
#> [4999] 8 8 3 8 8 7 6 6 3 3 5 6 10 6 9 9 2 2 9 2 8
#> [5020] 7 8 6 7 8 9 6 5 2 6 6 5 7 6 6 9 7 6 9 3 6
#> [5041] 3 10 5 2 9 2 6 9 7 10 7 3 3 6 6 3 3 2 9 6 10
#> [5062] 1 3 8 10 8 6 6 6 9 9 6 9 6 3 6 5 2 8 8 7
There seems to be an elbow at the 16th cluster. Silhouette on the other hand indicates 2 clusters.
6 Supervised Learning
6.1 Data Normalisation, splitting and balancing
Creating a new dataframe for analysis purposes, which includes all
nutrition-related features, as well as all ingredient-related features.
We name it recipes_analysis.
nutritional_df <- recipes %>%
select(ID, all_of(nutritional_values))
###### CAREFUL --> recipes_analysis should be of dim 10163 x 33
recipes_analysis <- ingredients_df_full %>%
left_join(nutritional_df, by="ID") %>%
mutate(across(all_of(contains("bin")), as.factor) , ID = as.character(ID)) %>%
select(rating, all_of(nutritional_values), contains("bin"), contains("total"))Creating a normalized version of recipes_analysis where
the continuous numerical values are normalised, to remain consistent
across all models for our analysis.
# Min-max scale a numeric vector onto [0, 1].
# NA values propagate (range() returns NA), and a constant vector
# yields NaN (0 / 0), exactly as with separate min()/max() calls.
my_normalise <- function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}
recipes_analysis_normd <- recipes_analysis %>%
mutate(rating = as.factor(rating), across(where(is.numeric), my_normalise))Creating training and test set. We chose a 75/25 split
#creating training and test sets
set.seed(12)
index <- createDataPartition(recipes_analysis_normd$rating, p=0.75, list = FALSE)We create a training set with the original rating
variable with 7 levels.
Below we can see the multilevel rating training set is really unbalanced throughout the classes.
#creating training and test set with multilevel rating variable
train_multi <- recipes_analysis_normd[index, ]
test_multi <- recipes_analysis_normd[-index, ]
table(train_multi$rating)#>
#> 1.25 1.875 2.5 3.125 3.75 4.375 5
#> 64 36 204 624 2187 3480 1029
6.1.1 Train/test sets with variable combinations for multilevel rating
#for train multi
tr_multi_Nut <- train_multi %>%
select(rating, all_of(nutritional_values))
te_multi_Nut <- test_multi %>%
select(rating, all_of(nutritional_values))
tr_multi_NuBi <- train_multi %>%
select(rating, all_of(nutritional_values), all_of(contains("bin")))
te_multi_NuBi <- test_multi %>%
select(rating, all_of(nutritional_values), all_of(contains("bin")))
tr_multi_NuTo <- train_multi %>%
select(rating, all_of(nutritional_values), all_of(contains("total")))
te_multi_NuTo <- test_multi %>%
select(rating, all_of(nutritional_values), all_of(contains("total")))6.1.2 TrainControl Functions
Here we create the train control functions we will use throughout the project.
#creating train control data for unbalanced data
trCtrl <- trainControl(method = "cv",
number = 5)#train control with downsampling
trCtrl_down <- trainControl(method = "cv",
number = 5,
sampling = "down")#train control with downsampling twoclass summary
trCtrl_down_twoClass <- trainControl(method = "cv",
summaryFunction = twoClassSummary,
classProbs = TRUE,
number = 5,
sampling = "down"
)6.2 Testing performance on 7-level rating classification with RF models
Given our correlation results in EDA, we anticipated that a classification with 7 levels will basically be equivalent to randomly classifying each observation in one of the 7 rating levels. However, to make sure this is the case, we try running a random forest on the nutritional values, as well as both the “total” and “bin” ingredients-related features we created during the EDA. We use a random forest for this testing phase because, being an ensemble method, it should give us reasonable results if such results are possible with the data we have. If the results are bad, we can assume that other models will not magically classify everything perfectly.
6.2.1 RF Multilevel Nut
Hyperparameters
Best tune: mtry = 2
Accuracy metrics
- Accuracy 41.6
- Kappa 0.047
6.2.1.1 Fitting
We first fit an RF model with multilevel rating on the nutritional values.
# Fit a random forest (method = "rf") predicting the 7-level rating
# from the nutritional values only, using 5-fold CV (trCtrl) and
# tuneLength = 3 candidate mtry values.
# preProcess = NULL: the data were already normalised upstream
# (recipes_analysis_normd), so no further preprocessing is applied.
# Seed fixed so the resampling folds are reproducible.
set.seed(12)
rf_multi_Nut_fit <- train(rating ~ .,
data = tr_multi_Nut,
method = "rf",
preProcess = NULL,
trControl = trCtrl,
tuneLength = 3)
#show the random forest with cv
rf_multi_Nut_fit#> Random Forest
#>
#> 7624 samples
#> 4 predictor
#> 7 classes: '1.25', '1.875', '2.5', '3.125', '3.75', '4.375', '5'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6100, 6100, 6098, 6099, 6099
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.4122 0.03741
#> 3 0.4073 0.03377
#> 4 0.4083 0.03771
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 2.
plot(rf_multi_Nut_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.2.2 Predictions
As can be seen by the predictions below, our assumptions were valid.
The model does a really bad job at classifying the test set into the
correct rating categories. It is equivalent to a random model with a
very strong bias towards the majority class at 4.375 (due to the data
imbalance present in the rating variable.)
set.seed(12)
#make predictions
rf_multi_Nut_pred <- predict(rf_multi_Nut_fit, newdata = te_multi_Nut)
#confusion matrix
confusionMatrix(data = rf_multi_Nut_pred, reference = te_multi_Nut$rating)#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 1.25 1.875 2.5 3.125 3.75 4.375 5
#> 1.25 0 0 0 0 0 0 0
#> 1.875 0 0 0 0 0 0 0
#> 2.5 0 0 0 0 2 5 0
#> 3.125 0 0 1 7 13 16 8
#> 3.75 4 6 13 53 183 266 72
#> 4.375 15 5 44 134 496 812 209
#> 5 2 1 10 14 35 60 53
#>
#> Overall Statistics
#>
#> Accuracy : 0.416
#> 95% CI : (0.396, 0.435)
#> No Information Rate : 0.456
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.047
#>
#> Mcnemar's Test P-Value : NA
#>
#> Statistics by Class:
#>
#> Class: 1.25 Class: 1.875 Class: 2.5
#> Sensitivity 0.00000 0.00000 0.00000
#> Specificity 1.00000 1.00000 0.99717
#> Pos Pred Value NaN NaN 0.00000
#> Neg Pred Value 0.99173 0.99527 0.97314
#> Prevalence 0.00827 0.00473 0.02678
#> Detection Rate 0.00000 0.00000 0.00000
#> Detection Prevalence 0.00000 0.00000 0.00276
#> Balanced Accuracy 0.50000 0.50000 0.49858
#> Class: 3.125 Class: 3.75 Class: 4.375 Class: 5
#> Sensitivity 0.03365 0.2510 0.701 0.1550
#> Specificity 0.98370 0.7713 0.346 0.9445
#> Pos Pred Value 0.15556 0.3065 0.473 0.3029
#> Neg Pred Value 0.91941 0.7188 0.579 0.8777
#> Prevalence 0.08192 0.2871 0.456 0.1347
#> Detection Rate 0.00276 0.0721 0.320 0.0209
#> Detection Prevalence 0.01772 0.2351 0.675 0.0689
#> Balanced Accuracy 0.50868 0.5111 0.523 0.5497
6.2.3 RF - Multilevel NuBi
6.2.3.1 Fitting
Hyperparameters
Best tune: mtry = 2
Accuracy metrics
- Accuracy 45.5
- Kappa 0.001
set.seed(12)
rf_multi_NuBi_fit <- caret::train(rating ~ .,
data = tr_multi_NuBi,
method = "rf",
preProcess = NULL,
trControl = trCtrl,
tuneLength = 3)
#show the random forest with cv
rf_multi_NuBi_fit#> Random Forest
#>
#> 7624 samples
#> 18 predictor
#> 7 classes: '1.25', '1.875', '2.5', '3.125', '3.75', '4.375', '5'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6100, 6100, 6098, 6099, 6099
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.4575 0.009305
#> 10 0.4276 0.045279
#> 18 0.4243 0.044598
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 2.
plot(rf_multi_NuBi_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.2.3.2 Predictions
The RF model with the NuBi combination (nutritional values and binary ingredient categories) also classified almost everything in the 4.375 rating class, leading to an accuracy of 45.6% and a Kappa of 1%, which indicates that the model is not better than a NIR model. This bias towards the majority class was confirmed by a sensitivity in that class of 98.27%, while all other classes had a sensitivity of 0 or very close to 0.
set.seed(12)
#make predictions
rf_multi_NuBi_pred <- predict(rf_multi_NuBi_fit, newdata = te_multi_NuBi)
#confusion matrix
confusionMatrix(data = rf_multi_NuBi_pred, reference = te_multi_NuBi$rating)#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 1.25 1.875 2.5 3.125 3.75 4.375 5
#> 1.25 0 0 0 0 0 0 0
#> 1.875 0 0 0 0 0 0 0
#> 2.5 0 0 0 0 0 0 0
#> 3.125 0 0 0 0 0 0 0
#> 3.75 1 2 4 5 8 10 7
#> 4.375 19 10 61 201 716 1139 325
#> 5 1 0 3 2 5 10 10
#>
#> Overall Statistics
#>
#> Accuracy : 0.456
#> 95% CI : (0.436, 0.475)
#> No Information Rate : 0.456
#> P-Value [Acc > NIR] : 0.539
#>
#> Kappa : 0.01
#>
#> Mcnemar's Test P-Value : NA
#>
#> Statistics by Class:
#>
#> Class: 1.25 Class: 1.875 Class: 2.5
#> Sensitivity 0.00000 0.00000 0.0000
#> Specificity 1.00000 1.00000 1.0000
#> Pos Pred Value NaN NaN NaN
#> Neg Pred Value 0.99173 0.99527 0.9732
#> Prevalence 0.00827 0.00473 0.0268
#> Detection Rate 0.00000 0.00000 0.0000
#> Detection Prevalence 0.00000 0.00000 0.0000
#> Balanced Accuracy 0.50000 0.50000 0.5000
#> Class: 3.125 Class: 3.75 Class: 4.375 Class: 5
#> Sensitivity 0.0000 0.01097 0.9827 0.02924
#> Specificity 1.0000 0.98398 0.0348 0.99044
#> Pos Pred Value NaN 0.21622 0.4609 0.32258
#> Neg Pred Value 0.9181 0.71183 0.7059 0.86762
#> Prevalence 0.0819 0.28712 0.4565 0.13470
#> Detection Rate 0.0000 0.00315 0.4486 0.00394
#> Detection Prevalence 0.0000 0.01457 0.9732 0.01221
#> Balanced Accuracy 0.5000 0.49748 0.5088 0.50984
6.2.4 RF - Multilevel NuTo
6.2.4.1 Fitting
Hyperparameters
Best tune: mtry = 2
Accuracy metrics
- Accuracy 44.8
- Kappa 0.006
set.seed(12)
rf_multi_NuTo_fit <- caret::train(rating ~ .,
data = tr_multi_NuTo,
method = "rf",
preProcess = NULL,
trControl = trCtrl,
tuneLength = 3)
#show the random forest with cv
rf_multi_NuTo_fit#> Random Forest
#>
#> 7624 samples
#> 18 predictor
#> 7 classes: '1.25', '1.875', '2.5', '3.125', '3.75', '4.375', '5'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6100, 6100, 6098, 6099, 6099
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.4537 0.01519
#> 10 0.4366 0.05438
#> 18 0.4356 0.05642
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 2.
plot(rf_multi_NuTo_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.2.4.2 Predictions
The final RF model for multilevel rating trained on the NuTo combination (nutritional values and total ingredient categories) did not perform much differently. It has a slightly lower accuracy than the NuBi model at 44.8% with an even worse Kappa at 0.6%. Once again, most observations are classified in the 4.375 rating class.
set.seed(12)
#make predictions
rf_multi_NuTo_pred <- predict(rf_multi_NuTo_fit, newdata = te_multi_NuTo)
#confusion matrix
confusionMatrix(data = rf_multi_NuTo_pred, reference = te_multi_NuTo$rating)#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 1.25 1.875 2.5 3.125 3.75 4.375 5
#> 1.25 0 0 0 0 0 0 0
#> 1.875 0 0 0 0 0 0 0
#> 2.5 0 0 0 0 0 0 0
#> 3.125 0 0 0 0 0 0 0
#> 3.75 1 1 6 10 33 54 14
#> 4.375 20 11 59 196 691 1094 318
#> 5 0 0 3 2 5 11 10
#>
#> Overall Statistics
#>
#> Accuracy : 0.448
#> 95% CI : (0.428, 0.467)
#> No Information Rate : 0.456
#> P-Value [Acc > NIR] : 0.815
#>
#> Kappa : 0.006
#>
#> Mcnemar's Test P-Value : NA
#>
#> Statistics by Class:
#>
#> Class: 1.25 Class: 1.875 Class: 2.5
#> Sensitivity 0.00000 0.00000 0.0000
#> Specificity 1.00000 1.00000 1.0000
#> Pos Pred Value NaN NaN NaN
#> Neg Pred Value 0.99173 0.99527 0.9732
#> Prevalence 0.00827 0.00473 0.0268
#> Detection Rate 0.00000 0.00000 0.0000
#> Detection Prevalence 0.00000 0.00000 0.0000
#> Balanced Accuracy 0.50000 0.50000 0.5000
#> Class: 3.125 Class: 3.75 Class: 4.375 Class: 5
#> Sensitivity 0.0000 0.0453 0.9439 0.02924
#> Specificity 1.0000 0.9525 0.0616 0.99044
#> Pos Pred Value NaN 0.2773 0.4579 0.32258
#> Neg Pred Value 0.9181 0.7124 0.5667 0.86762
#> Prevalence 0.0819 0.2871 0.4565 0.13470
#> Detection Rate 0.0000 0.0130 0.4309 0.00394
#> Detection Prevalence 0.0000 0.0469 0.9409 0.01221
#> Balanced Accuracy 0.5000 0.4989 0.5028 0.50984
6.2.5 Evaluation and decision for rest of analysis
Since we see a huge bias towards the majority class in unbalanced data (which we expected), we have to take a decision between 3 options:
- We could try balancing the 7 levels, either up or down. Due to the very large disparity in observation counts per level (min at 32 and max at 3481), we believe that neither up- nor downsampling would be ideal. Indeed, upsampling would mean creating a massive amount of artificial observations through replacement for the minority classes, while downsampling would leave us with an insufficient total amount of observations.
- We could try to aggregate the minority classes into one single class (i.e., putting 1.25, 1.875, 2.5, and 3.125 together), leaving us with 4 final classes. There would still be a significant imbalance however, and given the near-random classification that we observed above, we do not believe aggregating those classes is the solution.
- We could transform the 7 levels into a binary classification problem. This would help both with class imbalance and would hopefully also lead to higher accuracy metrics by simplifying the classification task.
After careful reflection, we believe that the 3rd option is our best bet. We therefore transform the 7-level rating variable into a binary rating variable, with the threshold at 4. This means that all ratings below 4 are now considered as “bad” and all ratings of 4 and above are considered as “good”.
6.3 Combinations of 4 Nutritional + 28 Engineered ingredients variables - unbalanced
#function to create the summary tables from the confusion matrices
#
# Extracts the four headline metrics from a caret::confusionMatrix
# object, in the column order used by the summary tables:
# accuracy, balanced accuracy, sensitivity, kappa.
#
# Uses named access (x$overall / x$byClass) instead of positional
# indexing (x[[3]][[1]], x[[4]][[11]], ...) so the extraction does not
# silently break if the component order of the confusionMatrix list
# ever changes, and so the intent of each value is explicit.
summary_table_fun <- function(x){
  acc  <- x$overall[["Accuracy"]]
  kap  <- x$overall[["Kappa"]]
  bacc <- x$byClass[["Balanced Accuracy"]]
  sens <- x$byClass[["Sensitivity"]]
  c(acc, bacc, sens, kap)
}
#creating empty summary table for unbalanced data
summary_table_unbal <- tibble(metric = c("accuracy", "balanced_accuracy", "sensitivity", "kappa"))6.3.1 Data preparation for binary rating analysis
6.3.1.1 Creating new rating_bin variable
We transform the original 7-level rating variable into a binary
rating_bin variable, “bad” or “good” (i.e., with a
threshold at 4 to decide if the rating is bad or good).
#creating binary rating variable
# Ratings strictly below 4 become "bad"; ratings of 4 and above become
# "good". The original 7-level rating column is then dropped.
recipes_analysis_bin <- recipes_analysis %>%
mutate(rating_bin = as.factor(ifelse(rating<4, "bad", "good"))) %>%
select(-rating)
#reversing rating_bin factor order
# as.factor() orders levels alphabetically ("bad", "good"); rev() makes
# the order ("good", "bad") so "good" is the first level — presumably so
# downstream caret summaries treat it as the positive class (the
# confusionMatrix calls below also set positive = "good" explicitly).
recipes_analysis_bin$rating_bin <- factor(recipes_analysis_bin$rating_bin, levels=rev(levels(recipes_analysis_bin$rating_bin )))
#normalising newly created df
recipes_analysis_bin <- recipes_analysis_bin %>%
mutate(across(where(is.numeric), my_normalise))6.3.1.2 Creating rating_bin training and test sets
#creating training and test sets
set.seed(12)
index_bin <- createDataPartition(recipes_analysis_bin$rating_bin, p=0.75, list = FALSE)We create a training and test set with the new
rating_bin variable with only 2 levels.
Below we can see the binary rating training set is still slightly unbalanced throughout the two classes.
#creating training and test set with multilevel rating variable
train_bin <- recipes_analysis_bin[index_bin, ]
test_bin <- recipes_analysis_bin[-index_bin, ]
table(train_bin$rating_bin)#>
#> good bad
#> 4508 3115
6.3.1.3 Creating various train/test sets with different variable combinations
#creating train and test set for all combinations of variables, for binary rating df
train_bin_Nut <- train_bin %>%
select(rating_bin, all_of(nutritional_values))
test_bin_Nut <- test_bin %>%
select(rating_bin, all_of(nutritional_values))
train_bin_NuTo <- train_bin %>%
select(rating_bin, all_of(nutritional_values), all_of(contains("total")))
test_bin_NuTo <- test_bin %>%
select(rating_bin, all_of(nutritional_values), all_of(contains("total")))
train_bin_NuBi <- train_bin %>%
select(rating_bin, all_of(nutritional_values), all_of(contains("bin")))
test_bin_NuBi <- test_bin %>%
select(rating_bin, all_of(nutritional_values), all_of(contains("bin")))
train_bin_Tot <- train_bin %>%
select(rating_bin, all_of(contains("total")))
test_bin_Tot <- test_bin %>%
select(rating_bin, all_of(contains("total")))
train_bin_Bin <- train_bin %>%
select(rating_bin, all_of(contains("bin")))
test_bin_Bin <- test_bin %>%
select(rating_bin, all_of(contains("bin")))6.3.2 Logistic Regression - unbalanced - Nut
6.3.2.1 Fitting
We now fit a logreg model on unbalanced data with only the nutritional variables.
set.seed(12)
logr_Nut_unbal_fit <- caret::train(rating_bin ~.,
data = train_bin_Nut,
method = "glm",
family = "binomial",
trControl = trCtrl)
logr_Nut_unbal_fit#> Generalized Linear Model
#>
#> 7623 samples
#> 4 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.5914 0
6.3.2.2 Predictions
We see that the relatively high accuracy is achieved by classifying all observations as good, resulting in a perfect sensitivity but a 0 specificity and a kappa of 0.
logr_Nut_unbal_pred <- predict(logr_Nut_unbal_fit, newdata = test_bin_Nut, type = "raw")
CM <- confusionMatrix(reference = test_bin_Nut$rating_bin, data = logr_Nut_unbal_pred, positive="good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1502 1038
#> bad 0 0
#>
#> Accuracy : 0.591
#> 95% CI : (0.572, 0.611)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.509
#>
#> Kappa : 0
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 1.000
#> Specificity : 0.000
#> Pos Pred Value : 0.591
#> Neg Pred Value : NaN
#> Prevalence : 0.591
#> Detection Rate : 0.591
#> Detection Prevalence : 1.000
#> Balanced Accuracy : 0.500
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, LogReg_Nut = summary_table_fun(CM))6.3.3 Logistic Regression - unbalanced - NuBi
6.3.3.1 Fitting
We now fit a logreg model on unbalanced data with the NuBi variable combination.
set.seed(12)
logr_NuBi_unbal_fit <- train(rating_bin ~.,
data = train_bin_NuBi,
method = "glm",
family = "binomial",
trControl = trCtrl)
logr_NuBi_unbal_fit#> Generalized Linear Model
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.5971 0.04562
6.3.3.2 Predictions
The accuracy here is slightly lower at 58.5%; however, the model is now a bit less biased towards the majority class and classifies at least some observations as bad. Kappa is still very low at 1.9%.
logr_NuBi_unbal_pred <- predict(logr_NuBi_unbal_fit, newdata = test_bin_NuBi, type = "raw")
CM <- confusionMatrix(reference = test_bin_NuBi$rating_bin, data = logr_NuBi_unbal_pred, positive="good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1399 950
#> bad 103 88
#>
#> Accuracy : 0.585
#> 95% CI : (0.566, 0.605)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.734
#>
#> Kappa : 0.019
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.9314
#> Specificity : 0.0848
#> Pos Pred Value : 0.5956
#> Neg Pred Value : 0.4607
#> Prevalence : 0.5913
#> Detection Rate : 0.5508
#> Detection Prevalence : 0.9248
#> Balanced Accuracy : 0.5081
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, LogReg_NuBi = summary_table_fun(CM))6.3.4 Logistic Regression - unbalanced - NuTo
6.3.4.1 Fitting
We now fit a logreg model on unbalanced data with the NuTo variable combination.
set.seed(12)
logr_NuTo_unbal_fit <- train(rating_bin ~.,
data = train_bin_NuTo,
method = "glm",
family = "binomial",
trControl = trCtrl)
logr_NuTo_unbal_fit#> Generalized Linear Model
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.595 0.03911
6.3.4.2 Predictions
Accuracy is slightly higher than with the NuBi combination, at 58.9%. The sensitivity is very similar and the kappa is slightly higher at 2.9%.
logr_NuTo_unbal_pred <- predict(logr_NuTo_unbal_fit, newdata = test_bin_NuTo, type = "raw")
CM <- confusionMatrix(reference = test_bin_NuTo$rating_bin, data = logr_NuTo_unbal_pred, positive="good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1401 942
#> bad 101 96
#>
#> Accuracy : 0.589
#> 95% CI : (0.57, 0.609)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.588
#>
#> Kappa : 0.029
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.9328
#> Specificity : 0.0925
#> Pos Pred Value : 0.5980
#> Neg Pred Value : 0.4873
#> Prevalence : 0.5913
#> Detection Rate : 0.5516
#> Detection Prevalence : 0.9224
#> Balanced Accuracy : 0.5126
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, LogReg_NuTo = summary_table_fun(CM))Out of the 3 LogReg models with unbalanced data, the NuTo model seems to perform the best. ### KNN - unbalanced - Nut
6.3.4.3 Fitting
We fit a KNN model on unbalanced data with only the nutritional variables. The best hyperparameter K if we look at both accuracy and Kappa is 113.
set.seed(12)
knn_Nut_unbal_fit <- caret::train(rating_bin ~.,
data = train_bin_Nut,
method = "knn",
trControl = trCtrl,
tuneGrid= data.frame(k = seq(105, 115,by = 2))#initially checked on 1-151 range, best was 113
)
knn_Nut_unbal_fit#> k-Nearest Neighbors
#>
#> 7623 samples
#> 4 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results across tuning parameters:
#>
#> k Accuracy Kappa
#> 105 0.5890 0.03661
#> 107 0.5876 0.03221
#> 109 0.5881 0.03268
#> 111 0.5885 0.03322
#> 113 0.5895 0.03570
#> 115 0.5881 0.03235
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was k = 113.
6.3.4.4 Predictions
We see that the relatively high accuracy is achieved by classifying most obs as good, resulting in a high sensitivity but a very poor specificity.
knn_Nut_unbal_pred <- predict(knn_Nut_unbal_fit, newdata = test_bin_Nut, type = "raw")
CM <- confusionMatrix(reference = test_bin_Nut$rating_bin, data = knn_Nut_unbal_pred, positive="good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1382 903
#> bad 120 135
#>
#> Accuracy : 0.597
#> 95% CI : (0.578, 0.616)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.279
#>
#> Kappa : 0.057
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.920
#> Specificity : 0.130
#> Pos Pred Value : 0.605
#> Neg Pred Value : 0.529
#> Prevalence : 0.591
#> Detection Rate : 0.544
#> Detection Prevalence : 0.900
#> Balanced Accuracy : 0.525
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, KNN_Nut = summary_table_fun(CM))6.3.5 KNN - unbalanced - NuBi
6.3.5.1 Fitting
We fit a KNN model on unbalanced data with the NuBi variable combination. The best hyperparameter K if we look at both accuracy and Kappa is 135
set.seed(12)
knn_NuBi_unbal_fit <- caret::train(rating_bin ~.,
data = train_bin_NuBi,
method = "knn",
trControl = trCtrl,
tuneGrid= data.frame(k = seq(131, 141,by = 2))#initially checked on 1-151 range, best was 135
)
knn_NuBi_unbal_fit#> k-Nearest Neighbors
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results across tuning parameters:
#>
#> k Accuracy Kappa
#> 131 0.5954 0.04616
#> 133 0.5936 0.04216
#> 135 0.5935 0.04255
#> 137 0.5946 0.04432
#> 139 0.5948 0.04454
#> 141 0.5943 0.04332
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was k = 131.
6.3.5.2 Predictions
Once again most obs are classified as good. Not better than NIR with an accuracy of 58.4 but balanced accuracy of 51.3.
knn_NuBi_unbal_pred <- predict(knn_NuBi_unbal_fit, newdata = test_bin_NuBi, type = "raw")
CM <- confusionMatrix(reference = test_bin_NuBi$rating_bin, data = knn_NuBi_unbal_pred, positive="good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1353 908
#> bad 149 130
#>
#> Accuracy : 0.584
#> 95% CI : (0.564, 0.603)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.785
#>
#> Kappa : 0.029
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.901
#> Specificity : 0.125
#> Pos Pred Value : 0.598
#> Neg Pred Value : 0.466
#> Prevalence : 0.591
#> Detection Rate : 0.533
#> Detection Prevalence : 0.890
#> Balanced Accuracy : 0.513
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, KNN_NuBi = summary_table_fun(CM))6.3.6 KNN - unbalanced - NuTo
6.3.6.1 Fitting
We fit a KNN model on unbalanced data with the NuTo variable combination. The best hyperparameter K if we look at both accuracy and Kappa is 147.
set.seed(12)
knn_NuTo_unbal_fit <- caret::train(rating_bin ~.,
data = train_bin_NuTo,
method = "knn",
trControl = trCtrl,
tuneGrid= data.frame(k = seq(141, 151,by = 2))#initially checked on 1-151 range, best was 147
)
knn_NuTo_unbal_fit#> k-Nearest Neighbors
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results across tuning parameters:
#>
#> k Accuracy Kappa
#> 141 0.5932 0.04621
#> 143 0.5943 0.04856
#> 145 0.5964 0.05284
#> 147 0.5966 0.05365
#> 149 0.5962 0.05279
#> 151 0.5965 0.05318
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was k = 147.
6.3.6.2 Predictions
We see a balanced accuracy of 51.8 and still a strong bias towards the positive class.
knn_NuTo_unbal_pred <- predict(knn_NuTo_unbal_fit, newdata = test_bin_NuTo, type = "raw")
CM <- confusionMatrix(reference = test_bin_NuTo$rating_bin, data = knn_NuTo_unbal_pred, positive="good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1370 909
#> bad 132 129
#>
#> Accuracy : 0.59
#> 95% CI : (0.571, 0.609)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.557
#>
#> Kappa : 0.041
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.912
#> Specificity : 0.124
#> Pos Pred Value : 0.601
#> Neg Pred Value : 0.494
#> Prevalence : 0.591
#> Detection Rate : 0.539
#> Detection Prevalence : 0.897
#> Balanced Accuracy : 0.518
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, KNN_NuTo = summary_table_fun(CM))6.3.7 CART - unbalanced - Nut
6.3.7.1 Fitting
Fitting a CART model on only the nutritional variables.
#CART_grid <- expand.grid(cp = c(0.001,0.005,0.01, 0.015, 0.02, 0.05, 0.1, 0.15, 0.2)) --> confirmed that highest kappa is at cp selected by caret
set.seed(12)
cart_Nut_unbal_fit <- train(rating_bin ~ .,
data = train_bin_Nut,
method = "rpart",
trControl = trCtrl,
tuneLength = 10)
cart_Nut_unbal_fit#> CART
#>
#> 7623 samples
#> 4 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results across tuning parameters:
#>
#> cp Accuracy Kappa
#> 0.001284 0.5838 0.05640
#> 0.001445 0.5863 0.05858
#> 0.001605 0.5891 0.05943
#> 0.001734 0.5883 0.05246
#> 0.001819 0.5880 0.05409
#> 0.001926 0.5880 0.05189
#> 0.002033 0.5893 0.05045
#> 0.002568 0.5872 0.02484
#> 0.002729 0.5911 0.02326
#> 0.003050 0.5932 0.01641
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was cp = 0.00305.
The best CP is 0.00305, before reaching a plateau with higher values of CP. It might seem like the CP chosen by caret is not optimal since the accuracy curve continues to go up; however, we confirmed by using a manual tuning grid that this CP was indeed the best when combining the accuracy with the Kappa metric.
# Get the results for each CP step
cart_Nut_unbal_results <- cart_Nut_unbal_fit$results
cart_Nut_unbal_results %>%
ggplot(aes(x=cp, y=Accuracy)) +
geom_line()+
labs(x = "CP Parameter", y = "Accuracy", title = "CP Plot")6.3.7.2 Predictions
Model predicts only positive class.
# Score the held-out test set with the cross-validated CART fit.
cart_Nut_unbal_pred <- predict(object = cart_Nut_unbal_fit, newdata = test_bin_Nut)
# Tabulate test-set performance: predictions vs. the true binary ratings.
CM <- confusionMatrix(data = cart_Nut_unbal_pred, reference = test_bin_Nut$rating_bin)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1502 1038
#> bad 0 0
#>
#> Accuracy : 0.591
#> 95% CI : (0.572, 0.611)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.509
#>
#> Kappa : 0
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 1.000
#> Specificity : 0.000
#> Pos Pred Value : 0.591
#> Neg Pred Value : NaN
#> Prevalence : 0.591
#> Detection Rate : 0.591
#> Detection Prevalence : 1.000
#> Balanced Accuracy : 0.500
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, CART_Nut = summary_table_fun(CM))6.3.8 CART - unbalanced - NuBi
6.3.8.1 Fitting
Fitting a CART model on the NuBi combination.
#CART_grid <- expand.grid(cp = c(0.001,0.005,0.01, 0.015, 0.02, 0.05, 0.1, 0.15, 0.2)) --> confirmed that highest kappa is at cp selected by caret
# Fit a CART (rpart) classifier on the NuBi (nutritional + binary) predictor
# combination; caret tunes 10 cp values under the shared CV control.
set.seed(12)
cart_NuBi_unbal_fit <- train(rating_bin ~ .,
data = train_bin_NuBi,
method = "rpart",
trControl = trCtrl,
tuneLength = 10)
cart_NuBi_unbal_fit#> CART
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results across tuning parameters:
#>
#> cp Accuracy Kappa
#> 0.001284 0.5839 0.049008
#> 0.001364 0.5840 0.048006
#> 0.001445 0.5838 0.047922
#> 0.001498 0.5830 0.045608
#> 0.001846 0.5852 0.049754
#> 0.002354 0.5859 0.043717
#> 0.002889 0.5877 0.046967
#> 0.003210 0.5877 0.043065
#> 0.005457 0.5915 0.020972
#> 0.007223 0.5894 0.009009
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was cp = 0.005457.
The best CP is 0.005457 (approximately 0.0055).
# Get the results for each CP step
# Plot CV accuracy against the cp grid to visualise the tuning curve.
cart_NuBi_unbal_results <- cart_NuBi_unbal_fit$results
cart_NuBi_unbal_results %>%
ggplot(aes(x=cp, y=Accuracy)) +
geom_line()+
labs(x = "CP Parameter", y = "Accuracy", title = "CP Plot")6.3.8.2 Predictions
Balanced accuracy is higher than with the CART model including only the Nutritional values.
# Score the held-out test set with the cross-validated CART fit.
cart_NuBi_unbal_pred <- predict(object = cart_NuBi_unbal_fit, newdata = test_bin_NuBi)
# Tabulate test-set performance: predictions vs. the true binary ratings.
CM <- confusionMatrix(data = cart_NuBi_unbal_pred, reference = test_bin_NuBi$rating_bin)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1377 919
#> bad 125 119
#>
#> Accuracy : 0.589
#> 95% CI : (0.57, 0.608)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.604
#>
#> Kappa : 0.036
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.917
#> Specificity : 0.115
#> Pos Pred Value : 0.600
#> Neg Pred Value : 0.488
#> Prevalence : 0.591
#> Detection Rate : 0.542
#> Detection Prevalence : 0.904
#> Balanced Accuracy : 0.516
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, CART_NuBi = summary_table_fun(CM))6.3.9 CART - unbalanced - NuTo
6.3.9.1 Fitting
We now fit a CART with the NuTo variable combination
#CART_grid <- expand.grid(cp = c(0.001,0.005,0.01, 0.015, 0.02, 0.05, 0.1, 0.15, 0.2)) --> confirmed that highest kappa is at cp selected by caret
# Fit a CART (rpart) classifier on the NuTo predictor combination;
# caret tunes 10 cp values under the shared CV control.
set.seed(12)
cart_NuTo_unbal_fit <- train(rating_bin ~ .,
data = train_bin_NuTo,
method = "rpart",
trControl = trCtrl,
tuneLength = 10)
cart_NuTo_unbal_fit#> CART
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Resampling results across tuning parameters:
#>
#> cp Accuracy Kappa
#> 0.001445 0.5878 0.06262
#> 0.001498 0.5861 0.05752
#> 0.001605 0.5881 0.05903
#> 0.001766 0.5885 0.06038
#> 0.001926 0.5870 0.05717
#> 0.002247 0.5876 0.04742
#> 0.002408 0.5886 0.04678
#> 0.002568 0.5886 0.04678
#> 0.003050 0.5881 0.05055
#> 0.004173 0.5911 0.02112
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was cp = 0.004173.
Again, it might seem like the CP chosen by caret is not optimal since the accuracy curve continues to go up, however, we confirmed by using a manual tuning grid that this CP was indeed the best when combining the accuracy with the Kappa metric.
# Get the results for each CP step
# Plot CV accuracy against the cp grid to visualise the tuning curve.
cart_NuTo_unbal_results <- cart_NuTo_unbal_fit$results
cart_NuTo_unbal_results %>%
ggplot(aes(x=cp, y=Accuracy)) +
geom_line()+
labs(x = "CP Parameter", y = "Accuracy", title = "CP Plot")6.3.9.2 Predictions
The model once again classifies everything as good due to the strong majority class bias.
# Score the held-out test set with the cross-validated CART fit.
cart_NuTo_unbal_pred <- predict(object = cart_NuTo_unbal_fit, newdata = test_bin_NuTo)
# Tabulate test-set performance: predictions vs. the true binary ratings.
CM <- confusionMatrix(data = cart_NuTo_unbal_pred, reference = test_bin_NuTo$rating_bin)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 1502 1038
#> bad 0 0
#>
#> Accuracy : 0.591
#> 95% CI : (0.572, 0.611)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.509
#>
#> Kappa : 0
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 1.000
#> Specificity : 0.000
#> Pos Pred Value : 0.591
#> Neg Pred Value : NaN
#> Prevalence : 0.591
#> Detection Rate : 0.591
#> Detection Prevalence : 1.000
#> Balanced Accuracy : 0.500
#>
#> 'Positive' Class : good
#>
summary_table_unbal <- cbind(summary_table_unbal, CART_NuTo = summary_table_fun(CM))6.4 Combinations of 4 Nutritional + 28 Engineered ingredients variables - balanced
#creating empty summary table for balanced data
summary_table_down <- tibble(metric = c("accuracy", "balanced_accuracy", "sensitivity", "kappa"))6.4.1 Random Forest - balanced - Nut
Hyperparameters
Best tune: mtry = 2
Accuracy metrics
- Balanced Accuracy 53.7
- Sensitivity 53.7
- Kappa 7.2
6.4.1.1 Fitting
# Random forest on the nutritional predictors. trCtrl_down applies
# down-sampling inside each CV fold, so the folds are class-balanced.
# tuneLength requests 10 mtry values, but with only 4 predictors caret
# truncates the grid (see the note in the output below).
set.seed(12)
rf_Nut_fit <- caret::train(rating_bin ~ .,
data = train_bin_Nut,
method = "rf",
trControl = trCtrl_down,
tuneLength = 10)#> note: only 3 unique complexity parameters in default grid. Truncating the grid to 3 .
#show the random forest with cv
rf_Nut_fit#> Random Forest
#>
#> 7623 samples
#> 4 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.5310 0.06036
#> 3 0.5247 0.05028
#> 4 0.5229 0.04546
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 2.
plot(rf_Nut_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.4.1.2 Predictions
We see a balanced accuracy and sensitivity both equal to 53.7.
# Seed kept for parity with the rest of the document, then score the test
# set and tabulate the resulting confusion matrix.
set.seed(12)
rf_Nut_pred <- predict(object = rf_Nut_fit, newdata = test_bin_Nut)
CM <- confusionMatrix(reference = test_bin_Nut$rating_bin, data = rf_Nut_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 806 480
#> bad 696 558
#>
#> Accuracy : 0.537
#> 95% CI : (0.517, 0.557)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.072
#>
#> Mcnemar's Test P-Value : 3.62e-10
#>
#> Sensitivity : 0.537
#> Specificity : 0.538
#> Pos Pred Value : 0.627
#> Neg Pred Value : 0.445
#> Prevalence : 0.591
#> Detection Rate : 0.317
#> Detection Prevalence : 0.506
#> Balanced Accuracy : 0.537
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, RF_Nut = summary_table_fun(CM))6.4.2 Random Forest - balanced - NuBi
Hyperparameters
Best tune: mtry = 3
Accuracy metrics
- Balanced accuracy 54.2
- Sensitivity 57.7
- Kappa 8.20
6.4.2.1 Fitting
The best mtry is 3.
# Random forest on the NuBi predictor combination, with down-sampling
# inside each CV fold (trCtrl_down) and 10 candidate mtry values.
set.seed(12)
rf_NuBi_fit <- caret::train(rating_bin ~ .,
data = train_bin_NuBi,
method = "rf",
trControl = trCtrl_down,
tuneLength = 10)
#show the random forest with cv
rf_NuBi_fit#> Random Forest
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.5512 0.09826
#> 3 0.5569 0.10237
#> 5 0.5490 0.09099
#> 7 0.5452 0.08598
#> 9 0.5472 0.08937
#> 10 0.5478 0.08876
#> 12 0.5482 0.09350
#> 14 0.5437 0.08312
#> 16 0.5403 0.07545
#> 18 0.5423 0.08035
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 3.
plot(rf_NuBi_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.4.2.2 Predictions
The balanced accuracy and sensitivity are higher than for the model with only nutritional data.
# Seed kept for parity with the rest of the document, then score the test
# set and tabulate the resulting confusion matrix.
set.seed(12)
rf_NuBi_pred <- predict(object = rf_NuBi_fit, newdata = test_bin_NuBi)
CM <- confusionMatrix(reference = test_bin_NuBi$rating_bin, data = rf_NuBi_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 867 513
#> bad 635 525
#>
#> Accuracy : 0.548
#> 95% CI : (0.528, 0.568)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.999995
#>
#> Kappa : 0.082
#>
#> Mcnemar's Test P-Value : 0.000355
#>
#> Sensitivity : 0.577
#> Specificity : 0.506
#> Pos Pred Value : 0.628
#> Neg Pred Value : 0.453
#> Prevalence : 0.591
#> Detection Rate : 0.341
#> Detection Prevalence : 0.543
#> Balanced Accuracy : 0.542
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, RF_NuBi = summary_table_fun(CM))6.4.3 Random Forest - balanced - NuTo
Hyperparameters
Best tune: mtry = 2
Accuracy metrics
- Balanced accuracy 55.6
- Sensitivity 59.1
- Kappa 11.1
6.4.3.1 Fitting
# Random forest on the NuTo predictor combination, with down-sampling
# inside each CV fold (trCtrl_down) and 10 candidate mtry values.
set.seed(12)
rf_NuTo_fit <- caret::train(rating_bin ~ .,
data = train_bin_NuTo,
method = "rf",
trControl = trCtrl_down,
tuneLength = 10)
#show the random forest with cv
rf_NuTo_fit#> Random Forest
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.5586 0.10858
#> 3 0.5545 0.09957
#> 5 0.5515 0.09524
#> 7 0.5511 0.09567
#> 9 0.5498 0.09433
#> 10 0.5521 0.09823
#> 12 0.5546 0.10354
#> 14 0.5468 0.08888
#> 16 0.5504 0.09405
#> 18 0.5482 0.09194
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 2.
plot(rf_NuTo_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.4.3.2 Predictions
The balanced accuracy has increased again compared to the NuBi model, now sitting at 55.6. We also reached the highest sensitivity of the 3 RF models tested so far at 59.1.
# Seed kept for parity with the rest of the document, then score the test
# set and tabulate the resulting confusion matrix.
set.seed(12)
rf_NuTo_pred <- predict(object = rf_NuTo_fit, newdata = test_bin_NuTo)
CM <- confusionMatrix(reference = test_bin_NuTo$rating_bin, data = rf_NuTo_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 887 496
#> bad 615 542
#>
#> Accuracy : 0.563
#> 95% CI : (0.543, 0.582)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.9985
#>
#> Kappa : 0.111
#>
#> Mcnemar's Test P-Value : 0.0004
#>
#> Sensitivity : 0.591
#> Specificity : 0.522
#> Pos Pred Value : 0.641
#> Neg Pred Value : 0.468
#> Prevalence : 0.591
#> Detection Rate : 0.349
#> Detection Prevalence : 0.544
#> Balanced Accuracy : 0.556
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, RF_NuTo = summary_table_fun(CM))6.4.4 Random Forest - balanced - Tot
Hyperparameters
Best tune: mtry = 3
Accuracy metrics
- Balanced Accuracy 52.4
- Sensitivity 52.9
- Kappa 4.7
6.4.4.1 Fitting
# Random forest on the Tot predictor combination, with down-sampling
# inside each CV fold. preProcess = NULL is the default and is kept
# here only for explicitness.
set.seed(12)
rf_Tot_fit <- caret::train(rating_bin ~ .,
data = train_bin_Tot,
method = "rf",
preProcess = NULL,
trControl = trCtrl_down,
tuneLength = 10)
#show the random forest with cv
rf_Tot_fit#> Random Forest
#>
#> 7623 samples
#> 14 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.5334 0.06957
#> 3 0.5407 0.07430
#> 4 0.5352 0.06709
#> 6 0.5357 0.06395
#> 7 0.5309 0.05548
#> 8 0.5292 0.04827
#> 10 0.5291 0.05021
#> 11 0.5294 0.04991
#> 12 0.5283 0.04822
#> 14 0.5298 0.04841
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 3.
plot(rf_Tot_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.4.4.2 Predictions
Both balanced accuracy and sensitivity are lower than for the NuTo model. No improvements here.
# Seed kept for parity with the rest of the document, then score the test
# set and tabulate the resulting confusion matrix.
set.seed(12)
rf_Tot_pred <- predict(object = rf_Tot_fit, newdata = test_bin_Tot)
CM <- confusionMatrix(reference = test_bin_Tot$rating_bin, data = rf_Tot_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 794 498
#> bad 708 540
#>
#> Accuracy : 0.525
#> 95% CI : (0.506, 0.545)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.047
#>
#> Mcnemar's Test P-Value : 1.76e-09
#>
#> Sensitivity : 0.529
#> Specificity : 0.520
#> Pos Pred Value : 0.615
#> Neg Pred Value : 0.433
#> Prevalence : 0.591
#> Detection Rate : 0.313
#> Detection Prevalence : 0.509
#> Balanced Accuracy : 0.524
#>
#> 'Positive' Class : good
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, RF_Tot = summary_table_fun(CM))6.4.5 Random Forest - balanced - Bin
Hyperparameters
Best tune: mtry = 2
Accuracy metrics
- Balanced accuracy 52.6
- Sensitivity 55.7
- Kappa 5.1
6.4.5.1 Fitting
# Random forest on the Bin predictor combination, with down-sampling
# inside each CV fold. preProcess = NULL is the default and is kept
# here only for explicitness.
set.seed(12)
rf_Bin_fit <- caret::train(rating_bin ~ .,
data = train_bin_Bin,
method = "rf",
preProcess = NULL,
trControl = trCtrl_down,
tuneLength = 10)
#show the random forest with cv
rf_Bin_fit#> Random Forest
#>
#> 7623 samples
#> 14 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.5384 0.07264
#> 3 0.5297 0.05937
#> 4 0.5309 0.06323
#> 6 0.5317 0.06704
#> 7 0.5277 0.05938
#> 8 0.5263 0.05836
#> 10 0.5251 0.05631
#> 11 0.5258 0.05787
#> 12 0.5331 0.06577
#> 14 0.5310 0.05715
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 2.
plot(rf_Bin_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.4.5.2 Predictions
We have higher performance than with the model using only the Tot variables, but still not higher than NuTo.
# Seed kept for parity with the rest of the document, then score the test
# set and tabulate the resulting confusion matrix.
set.seed(12)
rf_Bin_pred <- predict(object = rf_Bin_fit, newdata = test_bin_Bin)
CM <- confusionMatrix(reference = test_bin_Bin$rating_bin, data = rf_Bin_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 837 524
#> bad 665 514
#>
#> Accuracy : 0.532
#> 95% CI : (0.512, 0.551)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.051
#>
#> Mcnemar's Test P-Value : 4.91e-05
#>
#> Sensitivity : 0.557
#> Specificity : 0.495
#> Pos Pred Value : 0.615
#> Neg Pred Value : 0.436
#> Prevalence : 0.591
#> Detection Rate : 0.330
#> Detection Prevalence : 0.536
#> Balanced Accuracy : 0.526
#>
#> 'Positive' Class : good
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, RF_Bin = summary_table_fun(CM))We see that the 3 best performing models are: Nut, NuBi and NuTo. Which was expected given the importance of the nutritional values that we saw in the PCA during the EDA. Therefore, we decide to test further models only with those 3 combinations of variables, thus leaving out models trained solely on Tot or on Bin.
6.4.6 Logistic Regression - balanced - Nut
Accuracy metrics
- Balanced accuracy 51.0
- Sensitivity 34.0
- Kappa 1.8
6.4.6.1 Fitting
Fitting a logreg model on balanced data with only nutritional variables.
# Logistic regression (glm, binomial family) on the four nutritional
# variables, with down-sampling inside each CV fold (trCtrl_down).
set.seed(12)
logr_Nut_blncd <- caret::train(rating_bin ~.,
data = train_bin_Nut,
method = "glm",
family = "binomial",
trControl = trCtrl_down)
logr_Nut_blncd#> Generalized Linear Model
#>
#> 7623 samples
#> 4 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.4957 0.04683
We observe that the CV accuracy is 49.6%.
6.4.6.2 Predictions
We obtain a balanced accuracy of 51.0 and a very low sensitivity at 34.0. No overfitting when compared to the CV accuracy.
# Score the test set (type = "raw" returns class labels) and tabulate the
# confusion matrix, treating "good" as the positive class.
logr_Nut_pred_blncd <- predict(logr_Nut_blncd, newdata = test_bin_Nut, type = "raw")
CM <- confusionMatrix(data = logr_Nut_pred_blncd, reference = test_bin_Nut$rating_bin, positive = "good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 510 332
#> bad 992 706
#>
#> Accuracy : 0.479
#> 95% CI : (0.459, 0.498)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.018
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.340
#> Specificity : 0.680
#> Pos Pred Value : 0.606
#> Neg Pred Value : 0.416
#> Prevalence : 0.591
#> Detection Rate : 0.201
#> Detection Prevalence : 0.331
#> Balanced Accuracy : 0.510
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, LogReg_Nut = summary_table_fun(CM))6.4.7 Logistic Regression - balanced - NuBi
Accuracy metrics
- Balanced accuracy 54.5
- Sensitivity 55.3
- Kappa 8.8
6.4.7.1 Fitting
Fitting a logreg model on balanced data with NuBi variable combination.
# Logistic regression (glm, binomial family) on the NuBi predictor
# combination, with down-sampling inside each CV fold (trCtrl_down).
set.seed(12)
logr_NuBi_blncd <- caret::train(rating_bin ~.,
data = train_bin_NuBi,
method = "glm",
family = "binomial",
trControl = trCtrl_down)
logr_NuBi_blncd#> Generalized Linear Model
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.539 0.07101
We observe that the CV accuracy stands at 53.9%.
6.4.7.2 Predictions
Both balanced accuracy at 54.5 and sensitivity are higher than for the model with only Nut variables. Again no overfitting when comparing CV and test accuracy.
# Score the test set (type = "raw" returns class labels) and tabulate the
# confusion matrix, treating "good" as the positive class.
logr_NuBi_pred_blncd <- predict(logr_NuBi_blncd, newdata = test_bin_NuBi, type = "raw")
CM <- confusionMatrix(data = logr_NuBi_pred_blncd, reference = test_bin_NuBi$rating_bin, positive = "good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 830 480
#> bad 672 558
#>
#> Accuracy : 0.546
#> 95% CI : (0.527, 0.566)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.088
#>
#> Mcnemar's Test P-Value : 1.83e-08
#>
#> Sensitivity : 0.553
#> Specificity : 0.538
#> Pos Pred Value : 0.634
#> Neg Pred Value : 0.454
#> Prevalence : 0.591
#> Detection Rate : 0.327
#> Detection Prevalence : 0.516
#> Balanced Accuracy : 0.545
#>
#> 'Positive' Class : good
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, LogReg_NuBi = summary_table_fun(CM))6.4.8 Logistic Regression - balanced - NuTo
Accuracy metrics
- Balanced accuracy 55.9
- Sensitivity 57.2
- Kappa 11.4
6.4.8.1 Fitting
Fitting a logreg model on balanced data with NuTo variable combination.
# Logistic regression (glm, binomial family) on the NuTo predictor
# combination, with down-sampling inside each CV fold (trCtrl_down).
set.seed(12)
logr_NuTo_blncd <- caret::train(rating_bin ~.,
data = train_bin_NuTo,
method = "glm",
family = "binomial",
trControl = trCtrl_down)
logr_NuTo_blncd#> Generalized Linear Model
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.5409 0.07679
We observe that the CV accuracy stands at 54.1%.
6.4.8.2 Predictions
We want to evaluate the quality of the model and we notice that the balanced accuracy of the test set stands at 55.9%, which is higher than for the NuBi model fitted above. Same for the sensitivity which is now up to 57.2. Once again no overfitting of the model.
# Score the test set (type = "raw" returns class labels) and tabulate the
# confusion matrix, treating "good" as the positive class.
logr_NuTo_pred_blncd <- predict(logr_NuTo_blncd, newdata = test_bin_NuTo, type = "raw")
CM <- confusionMatrix(data = logr_NuTo_pred_blncd, reference = test_bin_NuTo$rating_bin, positive = "good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 859 472
#> bad 643 566
#>
#> Accuracy : 0.561
#> 95% CI : (0.541, 0.58)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.999
#>
#> Kappa : 0.114
#>
#> Mcnemar's Test P-Value : 3.56e-07
#>
#> Sensitivity : 0.572
#> Specificity : 0.545
#> Pos Pred Value : 0.645
#> Neg Pred Value : 0.468
#> Prevalence : 0.591
#> Detection Rate : 0.338
#> Detection Prevalence : 0.524
#> Balanced Accuracy : 0.559
#>
#> 'Positive' Class : good
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, LogReg_NuTo = summary_table_fun(CM))The best LogReg model on balanced data is the one with the NuTo combination of variables.
6.4.9 SVM - balanced - Nut
6.4.9.1 Linear SVM - balanced - Nut
svm_Linear_nutritional <- train(rating_bin ~ calories + protein + fat + sodium, data=train_bin_Nut, method = "svmLinear", trControl=trCtrl_down)
svm_Linear_nutritionalThe validation accuracy stands at 42.8%, which is not satisfying considering that it is computed on the training set. The next step consists in creating a grid of values for the cost that we want to try and pass to the argument tuneGrid.
6.4.9.1.1 Tuning the Hyperparameters - Linear basis SVM
# Candidate cost values for the linear-kernel SVM (log-spaced 0.01 to 1000).
grid <- expand.grid(C = c(0.01, 0.1, 1, 10, 100, 1000))
grid
# Re-train across the cost grid and inspect the accuracy profile.
svm_Linear_Grid_nutritional <- train(rating_bin ~., data=train_bin_Nut, method = "svmLinear", trControl=trCtrl_down, tuneGrid = grid)
svm_Linear_Grid_nutritional
plot(svm_Linear_Grid_nutritional)
svm_Linear_Grid_nutritional$bestTuneThe result indicates that setting the cost to C=10 provides the best model with accuracy=42.9%. The accuracy apparently reaches a plateau at this value. There is no sensible improvement compared to the previous linear SVM with default parameter cost C=1.
6.4.9.2 Radial SVM - balanced - Nut
svm_Radial_nutritional <- train(rating_bin ~., data=train_bin_Nut, method = "svmRadial", trControl=trCtrl_down)
svm_Radial_nutritionalThe validation accuracy stands at 54.8%, which is substantially better than the SVM model with the linear kernel. The next step consists in creating a grid of values for the cost that we want to try and pass to the argument tuneGrid.
6.4.9.2.1 Tuning the Hyperparameters - Radial basis SVM
# Grid of kernel widths (sigma) and costs (C) for the radial-kernel SVM.
grid_radial <- expand.grid(sigma = c(0.01, 0.02, 0.05, 0.1),
C = c(1, 10, 100, 500, 1000))
grid_radial
# Re-train across the sigma/C grid and inspect the accuracy profile.
svm_Radial_Grid_nutritional <- train(rating_bin ~., data=train_bin_Nut, method = "svmRadial", trControl=trCtrl_down, tuneGrid = grid_radial)
svm_Radial_Grid_nutritional
plot(svm_Radial_Grid_nutritional)
svm_Radial_Grid_nutritional$bestTuneThe optimal model from this search is with sigma = 0.1 and C = 1000. This optimal model would then reach accuracy=52.6%.
6.4.9.3 SVM Nut - Radial Kernel Best model
# After finding the best hyperparameters, we re-train the model with the best hyperparameters on the entire training set. Afterwards we will evaluate the model on the test set.
# Values taken from svm_Radial_Grid_nutritional$bestTune above (sigma = 0.1, C = 1000).
grid_radial_best <- expand.grid(sigma = 0.1, C = 1000)
recipe_rb_tuned_nutritional <- train(rating_bin ~., data=train_bin_Nut, method = "svmRadial", trControl=trCtrl_down, tuneGrid = grid_radial_best)recipe_rb_tuned_pred_nutritional <- predict(recipe_rb_tuned_nutritional, newdata = test_bin_Nut)
CM <- confusionMatrix(data=recipe_rb_tuned_pred_nutritional, reference = test_bin_Nut$rating_bin)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 646 361
#> bad 856 677
#>
#> Accuracy : 0.521
#> 95% CI : (0.501, 0.54)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.077
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.430
#> Specificity : 0.652
#> Pos Pred Value : 0.642
#> Neg Pred Value : 0.442
#> Prevalence : 0.591
#> Detection Rate : 0.254
#> Detection Prevalence : 0.396
#> Balanced Accuracy : 0.541
#>
#> 'Positive' Class : good
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, SVM_Radial_Nut = summary_table_fun(CM))The result indicates that with the tuned hyperparameters on the radial basis SVM model we achieve an accuracy of 52.1% on the test set. We can conclude that among all the models, the radial basis kernel SVM with C=1000 and sigma=0.1 is the best model.
6.4.10 SVM - balanced - NuBi
6.4.10.1 Linear SVM - balanced - NuBi
svm_Linear_NuBi <- train(rating_bin ~., data=train_bin_NuBi, method = "svmLinear", trControl=trCtrl_down)
svm_Linear_NuBiThe validation accuracy stands at 56.1%. The next step consists in creating a grid of values for the cost that we want to try and pass to the argument tuneGrid.
6.4.10.1.1 Tuning the Hyperparameters - Linear basis SVM
# Candidate cost values for the linear-kernel SVM (log-spaced 0.01 to 1000).
grid <- expand.grid(C = c(0.01, 0.1, 1, 10, 100, 1000))
grid
# Re-train across the cost grid and inspect the accuracy profile.
svm_Linear_Grid_NuBi <- train(rating_bin ~., data=train_bin_NuBi, method = "svmLinear", trControl=trCtrl_down, tuneGrid = grid)
svm_Linear_Grid_NuBi
plot(svm_Linear_Grid_NuBi)
svm_Linear_Grid_NuBi$bestTuneThe result indicates that setting the cost to C=0.01 provides the best model with accuracy=57.6%. There is a sensible improvement compared to the previous linear SVM with default parameter cost C=1.
6.4.10.2 Radial SVM - balanced - NuBi
svm_Radial_NuBi <- train(rating_bin ~., data=train_bin_NuBi, method = "svmRadial", trControl=trCtrl_down)
svm_Radial_NuBiThe validation accuracy stands at 53.7%, which is worse than the SVM model with the linear kernel. The next step consists in creating a grid of values for the cost that we want to try and pass to the argument tuneGrid.
6.4.10.2.1 Tuning the Hyperparameters - Radial basis SVM
# Grid of kernel widths (sigma) and costs (C) for the radial-kernel SVM.
grid_radial <- expand.grid(sigma = c(0.01, 0.02, 0.05, 0.1),
C = c(1, 10, 100, 500, 1000))
grid_radial
# Re-train across the sigma/C grid and inspect the accuracy profile.
svm_Radial_Grid_NuBi <- train(rating_bin ~., data=train_bin_NuBi, method = "svmRadial", trControl=trCtrl_down, tuneGrid = grid_radial)
svm_Radial_Grid_NuBi
plot(svm_Radial_Grid_NuBi)
svm_Radial_Grid_NuBi$bestTuneThe optimal model from this search is with sigma = 0.01 and C = 1. This optimal model would then reach accuracy=54.8%.
6.4.10.3 SVM NuBi - Radial Kernel Best model
# After finding the best hyperparameters, we re-train the model with the best hyperparameters on the entire training set. Afterwards we will evaluate the model on the test set.
# Values taken from svm_Radial_Grid_NuBi$bestTune above (sigma = 0.01, C = 1).
grid_radial_best <- expand.grid(sigma = 0.01, C = 1)
recipe_rb_tuned_NuBi <- train(rating_bin ~., data=train_bin_NuBi, method = "svmRadial", trControl=trCtrl_down, tuneGrid = grid_radial_best)recipe_rb_tuned_pred_NuBi <- predict(recipe_rb_tuned_NuBi, newdata = test_bin_NuBi)
CM <- confusionMatrix(data=recipe_rb_tuned_pred_NuBi, reference = test_bin_NuBi$rating_bin)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 777 492
#> bad 725 546
#>
#> Accuracy : 0.521
#> 95% CI : (0.501, 0.54)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.042
#>
#> Mcnemar's Test P-Value : 2.92e-11
#>
#> Sensitivity : 0.517
#> Specificity : 0.526
#> Pos Pred Value : 0.612
#> Neg Pred Value : 0.430
#> Prevalence : 0.591
#> Detection Rate : 0.306
#> Detection Prevalence : 0.500
#> Balanced Accuracy : 0.522
#>
#> 'Positive' Class : good
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, SVM_Radial_NuBi = summary_table_fun(CM))The result indicates that with the tuned hyperparameters on the radial basis SVM model we achieve an accuracy of 52.1% on the test set. We can conclude that among all the models, the radial basis kernel SVM with C=1 and sigma=0.01 is the best model.
6.4.11 SVM - balanced - NuTo
6.4.11.1 Linear SVM - balanced - NuTo
svm_Linear_NuTo <- train(rating_bin ~ calories + protein + fat + sodium, data=train_bin_NuTo, method = "svmLinear", trControl=trCtrl_down)
svm_Linear_NuToThe validation accuracy stands at 42.8%, which is not satisfying considering that it is computed on the training set. The next step consists in creating a grid of values for the cost that we want to try and pass to the argument tuneGrid.
6.4.11.1.1 Tuning the Hyperparameters - Linear basis SVM
# Candidate cost values for the linear-kernel SVM (log-spaced 0.01 to 1000).
grid <- expand.grid(C = c(0.01, 0.1, 1, 10, 100, 1000))
grid
# Re-train across the cost grid and inspect the accuracy profile.
svm_Linear_Grid_NuTo <- train(rating_bin ~., data=train_bin_NuTo, method = "svmLinear", trControl=trCtrl_down, tuneGrid = grid)
svm_Linear_Grid_NuTo
plot(svm_Linear_Grid_NuTo)
svm_Linear_Grid_NuTo$bestTuneThe result indicates that setting the cost to C=1000 provides the best model with accuracy=54%. There is a considerable improvement with respect to the previous linear SVM with default parameter cost C=1.
6.4.11.2 Radial SVM - balanced - NuTo
svm_Radial_NuTo <- train(rating_bin ~., data=train_bin_NuTo, method = "svmRadial", trControl=trCtrl_down)
svm_Radial_NuToThe validation accuracy stands at 53.6%, which is substantially better than the SVM model with the linear kernel and default parameters. The next step consists in creating a grid of values for the cost that we want to try and pass to the argument tuneGrid.
6.4.11.2.1 Tuning the Hyperparameters - Radial basis SVM
# Grid of kernel widths (sigma) and costs (C) for the radial-kernel SVM.
grid_radial <- expand.grid(sigma = c(0.01, 0.02, 0.05, 0.1),
C = c(1, 10, 100, 500, 1000))
grid_radial
# Re-train across the sigma/C grid and inspect the accuracy profile.
svm_Radial_Grid_NuTo <- train(rating_bin ~., data=train_bin_NuTo, method = "svmRadial", trControl=trCtrl_down, tuneGrid = grid_radial)
svm_Radial_Grid_NuTo
plot(svm_Radial_Grid_NuTo)
svm_Radial_Grid_NuTo$bestTuneThe optimal model from this search is with sigma = 0.01 and C = 1. This optimal model would then reach accuracy=53.7%.
6.4.11.3 SVM NuTo - Radial Kernel Best model
# After finding the best hyperparameters, we re-train the model with the best hyperparameters on the entire training set. Afterwards we will evaluate the model on the test set.
# Values taken from svm_Radial_Grid_NuTo$bestTune above (sigma = 0.01, C = 1).
grid_radial_best <- expand.grid(sigma = 0.01, C = 1)
recipe_rb_tuned_NuTo <- train(rating_bin ~., data=train_bin_NuTo, method = "svmRadial", trControl=trCtrl_down, tuneGrid = grid_radial_best)recipe_rb_tuned_pred_NuTo <- predict(recipe_rb_tuned_NuTo, newdata = test_bin_NuTo)
CM <- confusionMatrix(data=recipe_rb_tuned_pred_NuTo, reference = test_bin_NuTo$rating_bin)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 849 497
#> bad 653 541
#>
#> Accuracy : 0.547
#> 95% CI : (0.528, 0.567)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.084
#>
#> Mcnemar's Test P-Value : 4.86e-06
#>
#> Sensitivity : 0.565
#> Specificity : 0.521
#> Pos Pred Value : 0.631
#> Neg Pred Value : 0.453
#> Prevalence : 0.591
#> Detection Rate : 0.334
#> Detection Prevalence : 0.530
#> Balanced Accuracy : 0.543
#>
#> 'Positive' Class : good
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, SVM_Radial_NuTo = summary_table_fun(CM))The result indicates that with the tuned hyperparameters on the radial basis SVM model we achieve an accuracy of 55.4% on the test set. We can conclude that among all the models, the radial basis kernel SVM with C=1 and sigma=0.01 is the best model.
6.4.12 KNN - balanced - Nut
Hyperparameters
Best K = 146
Accuracy metrics
- Balanced Accuracy 53.6
- Sensitivity 56.1
- Kappa 7.1
6.4.12.1 Fitting
We fit a KNN model on nutritional variables. Best tuned K parameter is at 146.
# KNN on the nutritional-variables-only training set; K is tuned over
# 1, 6, 11, ..., 146 (30 candidates) under the downsampled 5-fold CV scheme.
set.seed(12)
knn_Nut <- caret::train(rating_bin ~.,
data = train_bin_Nut,
method = "knn",
trControl = trCtrl_down,
tuneGrid= data.frame(k = seq(1, 150,by = 5))
)
knn_Nut#> k-Nearest Neighbors
#>
#> 7623 samples
#> 4 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> k Accuracy Kappa
#> 1 0.5115 0.02034
#> 6 0.5127 0.02523
#> 11 0.5254 0.04719
#> 16 0.5205 0.03622
#> 21 0.5296 0.05524
#> 26 0.5285 0.05244
#> 31 0.5356 0.06752
#> 36 0.5327 0.05924
#> 41 0.5396 0.07569
#> 46 0.5397 0.07476
#> 51 0.5406 0.07732
#> 56 0.5367 0.07057
#> 61 0.5356 0.06750
#> 66 0.5422 0.08001
#> 71 0.5378 0.07045
#> 76 0.5405 0.07700
#> 81 0.5399 0.07641
#> 86 0.5399 0.07663
#> 91 0.5493 0.09493
#> 96 0.5441 0.08423
#> 101 0.5393 0.07589
#> 106 0.5415 0.07986
#> 111 0.5466 0.08803
#> 116 0.5401 0.07315
#> 121 0.5434 0.07961
#> 126 0.5426 0.07906
#> 131 0.5486 0.09043
#> 136 0.5508 0.09346
#> 141 0.5462 0.08639
#> 146 0.5511 0.09357
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was k = 146.
6.4.12.2 Predictions
The balanced accuracy is at 53.6 with a sensitivity of 56.1. No overfitting is apparent when comparing the CV accuracy with the test set accuracy.
# Predict classes on the Nut test set and build the test-set confusion matrix.
knn_Nut_pred <- predict(knn_Nut, newdata = test_bin_Nut, type = "raw")
# Consistency fix: make the positive class explicit, matching the NuBi/NuTo
# confusion-matrix calls elsewhere in this analysis. "good" is already the
# first factor level (caret's default positive class), so reported metrics
# are unchanged.
CM <- confusionMatrix(reference = test_bin_Nut$rating_bin, data = knn_Nut_pred, positive = "good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 842 507
#> bad 660 531
#>
#> Accuracy : 0.541
#> 95% CI : (0.521, 0.56)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.071
#>
#> Mcnemar's Test P-Value : 8.61e-06
#>
#> Sensitivity : 0.561
#> Specificity : 0.512
#> Pos Pred Value : 0.624
#> Neg Pred Value : 0.446
#> Prevalence : 0.591
#> Detection Rate : 0.331
#> Detection Prevalence : 0.531
#> Balanced Accuracy : 0.536
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, KNN_Nut = summary_table_fun(CM))6.4.13 KNN - balanced - NuBi
Hyperparameters
Best K = 71
Accuracy metrics
- Balanced Accuracy 51.7
- Sensitivity 51.3
- Kappa 3.4
6.4.13.1 Fitting
We fit a KNN model on balanced data and NuBi variable combination. Best tuned K parameter is at 71
# KNN on the NuBi (nutritional + engineered binary) training set; same K grid
# as the Nut model so the two tuning curves are directly comparable.
set.seed(12)
knn_NuBi <- caret::train(rating_bin ~.,
data = train_bin_NuBi,
method = "knn",
trControl = trCtrl_down,
tuneGrid= data.frame(k = seq(1, 150,by = 5))
)
knn_NuBi#> k-Nearest Neighbors
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> k Accuracy Kappa
#> 1 0.5159 0.03302
#> 6 0.5195 0.03762
#> 11 0.5175 0.03123
#> 16 0.5281 0.05469
#> 21 0.5291 0.05654
#> 26 0.5207 0.04282
#> 31 0.5249 0.05307
#> 36 0.5289 0.06206
#> 41 0.5199 0.04767
#> 46 0.5245 0.05251
#> 51 0.5239 0.05521
#> 56 0.5209 0.04855
#> 61 0.5347 0.07109
#> 66 0.5264 0.05810
#> 71 0.5371 0.07523
#> 76 0.5229 0.05468
#> 81 0.5293 0.06244
#> 86 0.5294 0.06422
#> 91 0.5315 0.07016
#> 96 0.5291 0.06254
#> 101 0.5287 0.06560
#> 106 0.5326 0.07371
#> 111 0.5293 0.06799
#> 116 0.5250 0.06135
#> 121 0.5294 0.07004
#> 126 0.5289 0.06792
#> 131 0.5305 0.07441
#> 136 0.5256 0.06340
#> 141 0.5287 0.06842
#> 146 0.5287 0.07384
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was k = 71.
6.4.13.2 Predictions
Performance of the NuBi KNN model is lower than for the model with only Nut variables. No overfitting detected though.
# Predict on the NuBi test set and compute the confusion matrix with "good"
# as the positive class.
knn_NuBi_pred <- predict(knn_NuBi, newdata = test_bin_NuBi, type = "raw")
CM <- confusionMatrix(reference = test_bin_NuBi$rating_bin, data = knn_NuBi_pred, positive="good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 770 496
#> bad 732 542
#>
#> Accuracy : 0.517
#> 95% CI : (0.497, 0.536)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.034
#>
#> Mcnemar's Test P-Value : 2e-11
#>
#> Sensitivity : 0.513
#> Specificity : 0.522
#> Pos Pred Value : 0.608
#> Neg Pred Value : 0.425
#> Prevalence : 0.591
#> Detection Rate : 0.303
#> Detection Prevalence : 0.498
#> Balanced Accuracy : 0.517
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, KNN_NuBi = summary_table_fun(CM))6.4.14 KNN - balanced - NuTo
Hyperparameters
Best K = 136
Accuracy metrics
- Balanced Accuracy 53.1
- Sensitivity 47.1
- Kappa 5.9
6.4.14.1 Fitting
We fit a KNN model on balanced data and the NuTo variable combination. Best tuned K parameter is at 136.
# KNN on the NuTo training set; same K grid as the other KNN models for
# comparability.
set.seed(12)
knn_NuTo <- caret::train(rating_bin ~.,
data = train_bin_NuTo,
method = "knn",
trControl = trCtrl_down,
tuneGrid= data.frame(k = seq(1, 150,by = 5))
)
knn_NuTo#> k-Nearest Neighbors
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> k Accuracy Kappa
#> 1 0.5246 0.04713
#> 6 0.5209 0.04099
#> 11 0.5235 0.04972
#> 16 0.5249 0.04998
#> 21 0.5292 0.06070
#> 26 0.5268 0.06144
#> 31 0.5310 0.07171
#> 36 0.5250 0.05884
#> 41 0.5243 0.05935
#> 46 0.5241 0.05879
#> 51 0.5205 0.05718
#> 56 0.5287 0.07129
#> 61 0.5216 0.05783
#> 66 0.5224 0.05861
#> 71 0.5263 0.06355
#> 76 0.5277 0.06930
#> 81 0.5259 0.06420
#> 86 0.5297 0.06996
#> 91 0.5259 0.06647
#> 96 0.5310 0.07598
#> 101 0.5317 0.07811
#> 106 0.5293 0.07052
#> 111 0.5262 0.06538
#> 116 0.5272 0.06658
#> 121 0.5321 0.07644
#> 126 0.5301 0.07321
#> 131 0.5273 0.07247
#> 136 0.5371 0.08872
#> 141 0.5298 0.07499
#> 146 0.5327 0.07970
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was k = 136.
6.4.14.2 Predictions
The NuTo model exhibits better balanced accuracy and sensitivity than the NuBi model tested above. However, it still had a lower balanced accuracy than the Nut KNN model we tested first, and the sensitivity at 47.1 is rather low as well. No overfitting was detected however.
# Predict on the NuTo test set and compute the confusion matrix with "good"
# as the positive class.
knn_NuTo_pred <- predict(knn_NuTo, newdata = test_bin_NuTo, type = "raw")
CM <- confusionMatrix(reference = test_bin_NuTo$rating_bin, data = knn_NuTo_pred, positive="good")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 707 424
#> bad 795 614
#>
#> Accuracy : 0.52
#> 95% CI : (0.5, 0.54)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.059
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.471
#> Specificity : 0.592
#> Pos Pred Value : 0.625
#> Neg Pred Value : 0.436
#> Prevalence : 0.591
#> Detection Rate : 0.278
#> Detection Prevalence : 0.445
#> Balanced Accuracy : 0.531
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, KNN_NuTo = summary_table_fun(CM))Overall, the best KNN model is the one fitted only on Nutritional variables.
6.4.15 CART - balanced - Nut
Hyperparameters
Best parameter CP is 0.0035
Accuracy metrics
- Balanced Accuracy 53.4
- Sensitivity 60.9
- Kappa 6.8
6.4.15.1 Fitting
Fitting a CART model with only nutritional variables on balanced data.
#CART_grid <- expand.grid(cp = c(0.001,0.005,0.01, 0.015, 0.02, 0.05, 0.1, 0.15, 0.2)) --> confirmed that highest kappa is at cp selected by caret
# CART (rpart) on nutritional variables only; tuneLength = 10 lets caret pick
# 10 candidate cp values automatically instead of a manual grid.
set.seed(12)
cart_Nut_fit <- train(rating_bin ~ .,
data = train_bin_Nut,
method = "rpart",
trControl = trCtrl_down,
tuneLength = 10)
cart_Nut_fit#> CART
#>
#> 7623 samples
#> 4 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> cp Accuracy Kappa
#> 0.001284 0.5270 0.05414
#> 0.001445 0.5289 0.05842
#> 0.001605 0.5321 0.06505
#> 0.001734 0.5350 0.06739
#> 0.001819 0.5330 0.06865
#> 0.001926 0.5351 0.06992
#> 0.002033 0.5407 0.06416
#> 0.002568 0.5428 0.07914
#> 0.002729 0.5432 0.07873
#> 0.003050 0.5477 0.08129
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was cp = 0.00305.
It might seem like the CP chosen by caret is not optimal since the accuracy curve continues to go up, however, we confirmed by using a manual tuning grid that this CP was indeed the best when combining the accuracy with the Kappa metric.
# Get the results for each CP step
cart_Nut_results <- cart_Nut_fit$results
# Plot CV accuracy against the cp parameter to visualise the tuning curve
cart_Nut_results %>%
ggplot(aes(x=cp, y=Accuracy)) +
geom_line()+
labs(x = "CP Parameter", y = "Accuracy", title = "CP Plot")6.4.15.2 Predictions
#Now using the cv to predict test set
cart_Nut_pred <- predict(cart_Nut_fit, newdata = test_bin_Nut)
#And looking at the confusion matrix for the predictions using the cv
CM <- confusionMatrix(reference = test_bin_Nut$rating_bin, data = cart_Nut_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 914 561
#> bad 588 477
#>
#> Accuracy : 0.548
#> 95% CI : (0.528, 0.567)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1.000
#>
#> Kappa : 0.068
#>
#> Mcnemar's Test P-Value : 0.443
#>
#> Sensitivity : 0.609
#> Specificity : 0.460
#> Pos Pred Value : 0.620
#> Neg Pred Value : 0.448
#> Prevalence : 0.591
#> Detection Rate : 0.360
#> Detection Prevalence : 0.581
#> Balanced Accuracy : 0.534
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, CART_Nut = summary_table_fun(CM))6.4.16 CART - balanced - NuBi
Hyperparameters
Best parameter CP is 0.0055
Accuracy metrics
- Balanced Accuracy 53.5
- Sensitivity 54.5
- Kappa 6.8
6.4.16.1 Fitting
Fitting a CART model with NuBi variable combination on balanced data.
#CART_grid <- expand.grid(cp = c(0.001,0.005,0.01, 0.015, 0.02, 0.05, 0.1, 0.15, 0.2)) --> confirmed that highest kappa is at cp selected by caret
# CART on the NuBi variable combination; same setup as cart_Nut_fit.
set.seed(12)
cart_NuBi_fit <- train(rating_bin ~ .,
data = train_bin_NuBi,
method = "rpart",
trControl = trCtrl_down,
tuneLength = 10)
cart_NuBi_fit#> CART
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> cp Accuracy Kappa
#> 0.001284 0.5306 0.06280
#> 0.001364 0.5338 0.06578
#> 0.001445 0.5292 0.06085
#> 0.001498 0.5367 0.06777
#> 0.001846 0.5348 0.06562
#> 0.002354 0.5390 0.06488
#> 0.002889 0.5340 0.06357
#> 0.003210 0.5318 0.06404
#> 0.005457 0.5536 0.08693
#> 0.007223 0.5478 0.07620
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was cp = 0.005457.
Best cp of 0.0055 for this model.
# Get the results for each CP step
cart_NuBi_results <- cart_NuBi_fit$results
# Tuning curve: CV accuracy as a function of cp
cart_NuBi_results %>%
ggplot(aes(x=cp, y=Accuracy)) +
geom_line()+
labs(x = "CP Parameter", y = "Accuracy", title = "CP Plot")6.4.16.2 Predictions
Balanced accuracy at 53.5 is slightly higher than for the Nut model, despite a significantly lower sensitivity. No overfitting detected when comparing CV and test accuracy metrics.
#Now using the cv to predict test set
cart_NuBi_pred <- predict(cart_NuBi_fit, newdata = test_bin_NuBi)
#And looking at the confusion matrix for the predictions using the cv
# Fix: the reference labels must come from the NuBi test set, the same set
# the predictions were made on. The original used test_bin_Nut$rating_bin
# (copy-paste slip). Presumably both test sets were built from the same
# partition and share identical labels — verify, in which case the reported
# numbers are unchanged — but the code is now self-consistent.
CM <- confusionMatrix(reference = test_bin_NuBi$rating_bin, data = cart_NuBi_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 818 493
#> bad 684 545
#>
#> Accuracy : 0.537
#> 95% CI : (0.517, 0.556)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.068
#>
#> Mcnemar's Test P-Value : 3.06e-08
#>
#> Sensitivity : 0.545
#> Specificity : 0.525
#> Pos Pred Value : 0.624
#> Neg Pred Value : 0.443
#> Prevalence : 0.591
#> Detection Rate : 0.322
#> Detection Prevalence : 0.516
#> Balanced Accuracy : 0.535
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, CART_NuBi = summary_table_fun(CM))6.4.17 CART - balanced - NuTo
Hyperparameters
Best parameter CP is 0.00257
Accuracy metrics
- Balanced Accuracy 54.0
- Sensitivity 52.3
- Kappa 7.7
6.4.17.1 Fitting
Fitting a CART model with NuTo variable combination on balanced data.
#CART_grid <- expand.grid(cp = c(0.001,0.005,0.01, 0.015, 0.02, 0.05, 0.1, 0.15, 0.2)) --> confirmed that highest kappa is at cp selected by caret
# CART on the NuTo variable combination; same setup as the other CART fits.
set.seed(12)
cart_NuTo_fit <- train(rating_bin ~ .,
data = train_bin_NuTo,
method = "rpart",
trControl = trCtrl_down,
tuneLength = 10)
cart_NuTo_fit#> CART
#>
#> 7623 samples
#> 18 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> cp Accuracy Kappa
#> 0.001445 0.5158 0.03192
#> 0.001498 0.5218 0.03905
#> 0.001605 0.5237 0.04427
#> 0.001766 0.5233 0.04411
#> 0.001926 0.5263 0.04881
#> 0.002247 0.5338 0.06240
#> 0.002408 0.5352 0.06382
#> 0.002568 0.5352 0.06382
#> 0.003050 0.5344 0.06431
#> 0.004173 0.5350 0.06183
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was cp = 0.002568.
CP quickly reaches a plateau, with best parameter at 0.0026.
# Get the results for each CP step
cart_NuTo_results <- cart_NuTo_fit$results
# Tuning curve: CV accuracy as a function of cp
cart_NuTo_results %>%
ggplot(aes(x=cp, y=Accuracy)) +
geom_line()+
labs(x = "CP Parameter", y = "Accuracy", title = "CP Plot")6.4.17.2 Predictions
The NuTo CART model reaches the highest balanced accuracy of the 3 CART models on balanced data, with a value of 54%. No overfitting detected.
#Now using the cv to predict test set
cart_NuTo_pred <- predict(cart_NuTo_fit, newdata = test_bin_NuTo)
#And looking at the confusion matrix for the predictions using the cv
# Fix: the reference labels must come from the NuTo test set, the same set
# the predictions were made on. The original used test_bin_Nut$rating_bin
# (copy-paste slip). Presumably both test sets share the same partition and
# labels — verify, in which case the reported numbers are unchanged — but
# the code is now self-consistent.
CM <- confusionMatrix(reference = test_bin_NuTo$rating_bin, data = cart_NuTo_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 785 459
#> bad 717 579
#>
#> Accuracy : 0.537
#> 95% CI : (0.517, 0.557)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.077
#>
#> Mcnemar's Test P-Value : 6.67e-14
#>
#> Sensitivity : 0.523
#> Specificity : 0.558
#> Pos Pred Value : 0.631
#> Neg Pred Value : 0.447
#> Prevalence : 0.591
#> Detection Rate : 0.309
#> Detection Prevalence : 0.490
#> Balanced Accuracy : 0.540
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, CART_NuTo = summary_table_fun(CM))The NuTo CART model is the best of the 3 CART models in terms of balanced accuracy, but has the lowest sensitivity of the 3.
6.5 All variables (Nut + 288 ingredients)
6.5.1 Creating dataset
We first create a new dataset containing the 4 nutritional variables and the 288 original binary ingredient variables (was 293 but we remove 5 below which were present in none of the recipes).
# Build the all-variables dataset: the 4 nutritional columns plus all original
# binary ingredient dummies, with a binary target (rating < 4 -> "bad").
analysis_all <- recipes %>%
select(rating, all_of(c(nutritional_values, all_ingredients_new))) %>%
mutate(rating_bin = as.factor(ifelse(rating<4, "bad", "good")), across(all_of(all_ingredients_new), as.factor)) %>%
select(-rating) %>%
select(rating_bin, everything())
#reversing rating_bin factor order
# so "good" becomes the first level (caret's default positive class)
analysis_all$rating_bin <- factor(analysis_all$rating_bin, levels=rev(levels(analysis_all$rating_bin)))
#normalising newly created df
analysis_all <- analysis_all %>%
mutate(across(where(is.numeric), my_normalise))We create a training set with the new rating_bin
variable with only 2 levels, including the nutritional and all
ingredients dummies, instead of the engineered variables used in the
sub-section above.
#creating training and test sets
# 75/25 stratified split on the binary rating (createDataPartition preserves
# class proportions in both sets)
set.seed(12)
index_all <- createDataPartition(analysis_all$rating_bin, p=0.75, list = FALSE)
#creating training and test set with binary rating variable on all variables
train_all <- analysis_all[index_all, ]
test_all <- analysis_all[-index_all, ]
table(train_all$rating_bin)#>
#> good bad
#> 4508 3115
6.5.1.1 Balancing the binary training set with all variables through manual downsampling
#filtering by rating class
set.seed(12)
tr_good <- train_all %>% filter(rating_bin == "good")
tr_bad <- train_all %>% filter(rating_bin == "bad")
#indexing "bad" and creating new resampled training set
# sample (without replacement) as many "good" rows as there are "bad" rows,
# producing a 50/50 downsampled training set
index_good <- sample(x = 1:nrow(tr_good), size = nrow(tr_bad), replace = FALSE)
downsamp_tr_all <- tibble(rbind(tr_good[index_good,], tr_bad))
#checking that we have the correct number of good and bad
table(downsamp_tr_all$rating_bin)#>
#> good bad
#> 3115 3115
6.5.2 Random Forest - balanced - 4 Nut + 288 variables
Hyperparameters
Best tune: mtry = 147
Accuracy metrics With tuneLength = 3
- Balanced Accuracy 56.1
- Sensitivity 55.5
- Kappa 11.8
6.5.2.1 Fitting
We fit an RF model on the 292 variables using a tuneLength of 3 for the mtry parameter, to avoid the significant computation time that higher tuning lengths would incur.
# Random forest on all 292 predictors; tuneLength = 3 keeps the mtry search
# small (3 candidates) to limit computation time.
set.seed(12)
rf_all_fit <- train(rating_bin ~ .,
data = train_all,
method = "rf",
trControl = trCtrl_down,
tuneLength = 3)
#show the random forest with cv
rf_all_fit#> Random Forest
#>
#> 7623 samples
#> 292 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 2 0.5289 0.09973
#> 147 0.5611 0.12227
#> 292 0.5578 0.11194
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was mtry = 147.
plot(rf_all_fit, main = "Plot of CV Accuracy relative to Mtry parameter")6.5.2.2 Predictions
The model achieves a very high balanced accuracy relative to the other models tested so far, at 56.1%. The sensitivity is also relatively high at 55.5%, but we have seen better in models we have tuned so far. Overall the performance of the RF_all model is still more than decent. We also checked that there was no overfitting by comparing the CV accuracy at mtry 147 with the test accuracy, both being at around 56%.
set.seed(12)
#make predictions
rf_all_pred <- predict(rf_all_fit, newdata = test_all)
#confusion matrix
CM <- confusionMatrix(data = rf_all_pred, reference = test_all$rating_bin)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 834 450
#> bad 668 588
#>
#> Accuracy : 0.56
#> 95% CI : (0.54, 0.579)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.999
#>
#> Kappa : 0.118
#>
#> Mcnemar's Test P-Value : 8.59e-11
#>
#> Sensitivity : 0.555
#> Specificity : 0.566
#> Pos Pred Value : 0.650
#> Neg Pred Value : 0.468
#> Prevalence : 0.591
#> Detection Rate : 0.328
#> Detection Prevalence : 0.506
#> Balanced Accuracy : 0.561
#>
#> 'Positive' Class : good
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, RF_all = summary_table_fun(CM))6.5.3 Logistic Regression - balanced - 4 Nut + 288 variables
Accuracy metrics
- Balanced Accuracy 55.8
- Sensitivity 55.4
- Kappa 11.3
6.5.3.1 Fitting
We now fit a logreg model on the 4 nut variables and the 288 ingredients variables.
#too long
logreg_all_fit <- train(rating_bin~.,
data=downsamp_tr_all,
method="glm",
family = "binomial",
trControl=trCtrl_down,
trace = FALSE)
logreg_all_fit#> Generalized Linear Model
#>
#> 6230 samples
#> 292 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 4984, 4984, 4984, 4984, 4984
#> Addtional sampling using down-sampling
#>
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.5568 0.1136
6.5.3.2 Predictions
We find a balanced accuracy which is slightly lower than the one of the RF_all model, but still relatively high. The accuracy of the LogReg_all model is also very slightly lower than that of the RF_all. Again, no overfitting detected for this model, which is good.
#predicting
logreg_all_pred <- predict(logreg_all_fit, newdata = test_all)
#creating confusion matrix
CM <- confusionMatrix(reference = test_all$rating_bin, data = logreg_all_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 832 454
#> bad 670 584
#>
#> Accuracy : 0.557
#> 95% CI : (0.538, 0.577)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.113
#>
#> Mcnemar's Test P-Value : 1.43e-10
#>
#> Sensitivity : 0.554
#> Specificity : 0.563
#> Pos Pred Value : 0.647
#> Neg Pred Value : 0.466
#> Prevalence : 0.591
#> Detection Rate : 0.328
#> Detection Prevalence : 0.506
#> Balanced Accuracy : 0.558
#>
#> 'Positive' Class : good
#>
#adding results to the summary_table_unbal
summary_table_down <- cbind(summary_table_down, LogReg_all = summary_table_fun(CM))6.5.4 CART - balanced - 4 Nut + 288 variables
Hyperparameters
Best parameter CP is 0.0036
Accuracy metrics
- Balanced Accuracy 53.5
- Sensitivity 52.7
- Kappa 6.8
6.5.4.1 Fitting
#CART_grid <- expand.grid(cp = c(0.001,0.005,0.01, 0.015, 0.02, 0.05, 0.1, 0.15, 0.2)) --> confirmed that highest kappa is at cp selected by caret
# CART on all 292 predictors; same tuning setup as the earlier CART fits.
set.seed(12)
cart_all_fit <- train(rating_bin ~ .,
data = train_all,
method = "rpart",
trControl = trCtrl_down,
tuneLength = 10)
cart_all_fit#> CART
#>
#> 7623 samples
#> 292 predictor
#> 2 classes: 'good', 'bad'
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 6099, 6099, 6098, 6098, 6098
#> Addtional sampling using down-sampling
#>
#> Resampling results across tuning parameters:
#>
#> cp Accuracy Kappa
#> 0.0009172 0.5426 0.06545
#> 0.0009631 0.5410 0.06406
#> 0.0010166 0.5414 0.06589
#> 0.0010701 0.5397 0.06309
#> 0.0012841 0.5468 0.07655
#> 0.0014446 0.5393 0.07740
#> 0.0016051 0.5407 0.07690
#> 0.0018192 0.5352 0.07239
#> 0.0026485 0.5430 0.07834
#> 0.0036116 0.5475 0.08248
#>
#> Accuracy was used to select the optimal model using the
#> largest value.
#> The final value used for the model was cp = 0.003612.
Once again the CP parameter reaches a plateau, even though it might look like the accuracy will keep climbing, it is not the case.
# Get the results for each CP step
cart_all_results <- cart_all_fit$results
# Tuning curve: CV accuracy as a function of cp
cart_all_results %>%
ggplot(aes(x=cp, y=Accuracy)) +
geom_line()+
labs(x = "CP Parameter", y = "Accuracy", title = "CP Plot")6.5.4.2 Predictions
The results for the CART model with all variables are rather disappointing. It has a pretty low balanced accuracy and sensitivity, similar to the CART models trained on the engineered variables and nutritional values combinations. It can’t compete with the LogReg_all or RF_all models tested above.
#Now using the cv to predict test set
cart_all_pred <- predict(cart_all_fit, newdata = test_all)
#And looking at the confusion matrix for the predictions using the cv
# Fix: the reference must be the labels of test_all, the set we predicted on.
# The original used test_bin_Nut$rating_bin, which comes from a different
# data partition (index_all vs the earlier index), so the previously reported
# metrics may have been computed against misaligned labels — re-run to confirm.
CM <- confusionMatrix(reference = test_all$rating_bin, data = cart_all_pred)
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction good bad
#> good 792 474
#> bad 710 564
#>
#> Accuracy : 0.534
#> 95% CI : (0.514, 0.553)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.068
#>
#> Mcnemar's Test P-Value : 8.52e-12
#>
#> Sensitivity : 0.527
#> Specificity : 0.543
#> Pos Pred Value : 0.626
#> Neg Pred Value : 0.443
#> Prevalence : 0.591
#> Detection Rate : 0.312
#> Detection Prevalence : 0.498
#> Balanced Accuracy : 0.535
#>
#> 'Positive' Class : good
#>
summary_table_down <- cbind(summary_table_down, CART_all = summary_table_fun(CM))6.5.5 XGBoost - balanced - 4 Nut + 288 variables
Here we decided to try the XGBoost model for the first time as a bonus test to see if it would magically work better than the others we have tested so far with other variable combinations.
#creating datasets for xgboost
# xgboost needs an all-numeric matrix, so the factor dummies are converted
# factor -> character -> numeric and the label is encoded good = 1, bad = 0.
numeric_downsamp_tr <- downsamp_tr_all %>%
mutate(rating_bin = ifelse(rating_bin == "good", 1, 0), across(everything(), as.character)) %>%
mutate(across(everything(), as.numeric))
numeric_te <- test_all %>%
mutate(rating_bin = ifelse(rating_bin == "good", 1, 0), across(everything(), as.character)) %>%
mutate(across(everything(), as.numeric))
#creating datasets which don't include the labels, for inputs in the xgb Matrix
explan_downsamp_tr <- numeric_downsamp_tr %>%
select(-rating_bin)
explan_te <- numeric_te %>%
select(-rating_bin)
#prepare data for XGBoost by creating xgb matrix
xgb_downsamp_tr <- xgb.DMatrix(data = as.matrix(explan_downsamp_tr), label = numeric_downsamp_tr$rating_bin)
xgb_te <- xgb.DMatrix(data = as.matrix(explan_te), label = numeric_te$rating_bin)6.5.5.1 Fitting
Accuracy metrics
- Balanced Accuracy 56.0
- Sensitivity 59.5
- Kappa 11.8
We fit the XGBoost model after manual tuning of the hyperparameters.
#set hyperparameters
# Manually tuned XGBoost settings: logistic objective for the binary target;
# gamma = 7 (minimum loss reduction per split) and the 0.6 row/column
# subsampling act as regularisation against overfitting.
param_list <- list(
objective = "binary:logistic",
eta = 0.1,
gamma = 7,
max_depth = 5,
min_child_weight = 1,
subsample = 0.6,
colsample_bytree = 0.6
)
#train the model
set.seed(12)
xgb_down_fit <- xgboost(data = xgb_downsamp_tr,
params = param_list,
nrounds = 100,
verbose = FALSE)Here we control for overfitting by checking the balanced accuracy on the training set, which stands at a 61.3%. We have found a great balance between performance and overfitting by setting the gamma parameter at 7 and the nrounds at 100.
#testing on training set to control overfitting
xgb_down_prob_train <- predict(xgb_down_fit, newdata = xgb_downsamp_tr)
# threshold predicted probabilities at 0.5 to recover class labels (1 = good)
xgb_down_pred_train <- as.factor(ifelse(xgb_down_prob_train<0.5, 0, 1))
CM <- confusionMatrix(reference = as.factor(numeric_downsamp_tr$rating_bin), data = xgb_down_pred_train, positive = "1")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 1839 1133
#> 1 1276 1982
#>
#> Accuracy : 0.613
#> 95% CI : (0.601, 0.625)
#> No Information Rate : 0.5
#> P-Value [Acc > NIR] : < 2e-16
#>
#> Kappa : 0.227
#>
#> Mcnemar's Test P-Value : 0.00381
#>
#> Sensitivity : 0.636
#> Specificity : 0.590
#> Pos Pred Value : 0.608
#> Neg Pred Value : 0.619
#> Prevalence : 0.500
#> Detection Rate : 0.318
#> Detection Prevalence : 0.523
#> Balanced Accuracy : 0.613
#>
#> 'Positive' Class : 1
#>
6.5.5.2 Predictions
We now check the actual balanced accuracy on the testing set. We see that the accuracy is at 56.0%. While this is slightly lower than what we found on the training set, it does not signal significant overfitting.
In terms of other metrics, we achieve a relatively high balanced accuracy at 56.0 which is just slightly lower than the BACC found with the RF_all model. We also have a high sensitivity at 59.5. This model looks like a great candidate to be one of the best models overall.
# Test-set probabilities, thresholded at 0.5 (1 = good), then confusion matrix
xgb_down_prob <- predict(xgb_down_fit, newdata = xgb_te)
xgb_down_pred <- as.factor(ifelse(xgb_down_prob<0.5, 0, 1))
CM <- confusionMatrix(reference = as.factor(numeric_te$rating_bin), data = xgb_down_pred, positive = "1")
CM#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 546 609
#> 1 492 893
#>
#> Accuracy : 0.567
#> 95% CI : (0.547, 0.586)
#> No Information Rate : 0.591
#> P-Value [Acc > NIR] : 0.994717
#>
#> Kappa : 0.118
#>
#> Mcnemar's Test P-Value : 0.000472
#>
#> Sensitivity : 0.595
#> Specificity : 0.526
#> Pos Pred Value : 0.645
#> Neg Pred Value : 0.473
#> Prevalence : 0.591
#> Detection Rate : 0.352
#> Detection Prevalence : 0.545
#> Balanced Accuracy : 0.560
#>
#> 'Positive' Class : 1
#>
#adding results to summary table
summary_table_down <- cbind(summary_table_down, XGB_all = summary_table_fun(CM))6.6 Model Selection
We decide to select models only within the models that we fitted with the binary rating variable. This leaves us with both the balanced and unbalanced sections with the binary rating.
#creating new df formats for model selection analysis plots
# Long format of the balanced-data results: drop plain accuracy (not used in
# model selection), one row per (model, metric), values scaled to percent.
summary_down_long <- summary_table_down %>%
filter(metric != "accuracy") %>%
pivot_longer(-metric, names_to = "model", values_to = "value") %>%
select(model, everything()) %>%
mutate(value = value*100) %>%
arrange(model)
summary_down_wide <- summary_down_long %>%
pivot_wider(names_from = "metric", values_from = "value")6.6.1 Unbalanced data
Here we can see the results for section 5.3 on the unbalanced data. We observe that the balanced accuracy is rarely too far from 50% which indicates a pretty bad classification. The sensitivity is also very high for most models due to the bias towards the majority class “good” due to the class imbalance. Given this poor performance, we decide to focus on the models with balanced data to perform our selection of the best model.
#displaying table without accuracy metric since we do not use it in the model selection.
summary_table_unbal %>%
filter(metric != "accuracy") %>%
kbl() %>%
kable_styling(position = "center")| metric | LogReg_Nut | LogReg_NuBi | LogReg_NuTo | KNN_Nut | KNN_NuBi | KNN_NuTo | CART_Nut | CART_NuBi | CART_NuTo |
|---|---|---|---|---|---|---|---|---|---|
| balanced_accuracy | 0.5 | 0.5081 | 0.5126 | 0.5251 | 0.5130 | 0.5182 | 0.5 | 0.5157 | 0.5 |
| sensitivity | 1.0 | 0.9314 | 0.9328 | 0.9201 | 0.9008 | 0.9121 | 1.0 | 0.9168 | 1.0 |
| kappa | 0.0 | 0.0185 | 0.0289 | 0.0568 | 0.0294 | 0.0412 | 0.0 | 0.0356 | 0.0 |
6.6.2 Balanced data
Here we can see the results for section 5.4 on the downsampled data, with a total of 21 models that we fitted with various combinations of explanatory variables. We see that the balanced accuracy is already higher, and the sensitivity is more reasonable. Below, we investigate various aspects of these metrics and select the top-performing models. We decide to use balanced accuracy and sensitivity as the two core metrics to quantitatively rank the best models.
#displaying table without accuracy metric since we do not use it in the model selection.
summary_down_wide %>%
kbl() %>%
kable_styling(position = "center")| model | balanced_accuracy | sensitivity | kappa |
|---|---|---|---|
| CART_NuBi | 53.48 | 54.46 | 6.773 |
| CART_NuTo | 54.02 | 52.26 | 7.747 |
| CART_Nut | 53.40 | 60.85 | 6.779 |
| CART_all | 53.53 | 52.73 | 6.825 |
| KNN_NuBi | 51.74 | 51.27 | 3.363 |
| KNN_NuTo | 53.11 | 47.07 | 5.897 |
| KNN_Nut | 53.61 | 56.06 | 7.054 |
| LogReg_NuBi | 54.51 | 55.26 | 8.766 |
| LogReg_NuTo | 55.86 | 57.19 | 11.428 |
| LogReg_Nut | 50.99 | 33.95 | 1.794 |
| LogReg_all | 55.83 | 55.39 | 11.292 |
| RF_Bin | 52.62 | 55.73 | 5.136 |
| RF_NuBi | 54.15 | 57.72 | 8.153 |
| RF_NuTo | 55.64 | 59.05 | 11.074 |
| RF_Nut | 53.71 | 53.66 | 7.188 |
| RF_Tot | 52.44 | 52.86 | 4.738 |
| RF_all | 56.09 | 55.53 | 11.791 |
| SVM_Radial_NuBi | 52.17 | 51.73 | 4.187 |
| SVM_Radial_NuTo | 54.32 | 56.52 | 8.448 |
| SVM_Radial_Nut | 54.12 | 43.01 | 7.666 |
| XGB_all | 56.03 | 59.45 | 11.849 |
summary_table_down_store <- summary_table_down#selecting 3 models with highest balance accuracy
# Despite the "3 models" comments below, each selection keeps the top 5
# (slice(1:5)); the plots and text refer to top-5 models.
bacc_select <- summary_down_wide %>%
arrange(desc(balanced_accuracy)) %>%
dplyr::slice(1:5) %>% pull(model)
#selecting 3 models with highest sens
sens_select <- summary_down_wide %>%
arrange(desc(sensitivity)) %>%
dplyr::slice(1:5) %>% pull(model)
#selecting 3 models with highest kappa
kappa_select <- summary_down_wide %>%
arrange(desc(kappa)) %>%
dplyr::slice(1:5) %>% pull(model)
dplyr::slice(1:5) %>% pull(model)We first look at the top-5 models with the highest balanced_accuracy
# Mean balanced accuracy across all models, rounded to 2 decimals
# (computed for use in the accompanying text; not used in the plot below).
mean_bacc <- summary_down_wide %>%
summarise(mean_bacc = mean(balanced_accuracy)) %>% pull() %>% round(., 2)
# Bar chart of the 5 models with the highest balanced accuracy,
# y axis zoomed to the 50-60% range. Fixes the y-axis label typo
# "Balance accuracy" -> "Balanced accuracy".
summary_down_long %>%
filter(model %in% bacc_select, metric=="balanced_accuracy") %>%
ggplot(aes(x=model, y=value, fill = model))+
geom_bar(stat="identity", position = "dodge")+
coord_cartesian(ylim = c(50, 60)) +
labs(y = "Balanced accuracy (in %)", x = "Model", title = "Top 5 models with highest balanced accuracy")We now plot the top-5 models with the highest sensitivity.
# Mean sensitivity across all models, rounded to 2 decimals
# (computed for use in the accompanying text; not used in the plot below).
mean_sens<- summary_down_wide %>%
summarise(mean_sens = mean(sensitivity)) %>% pull() %>% round(., 2)
# Bar chart of the 5 models with the highest sensitivity,
# y axis zoomed to the 55-65% range.
summary_down_long %>%
filter(model %in% sens_select, metric=="sensitivity") %>%
ggplot(aes(x=model, y=value, fill = model))+
geom_bar(stat="identity", position = "dodge")+
coord_cartesian(ylim = c(55, 65)) +
labs(y = "Sensitivity (in %)", x = "Model", title = "Top 5 models with highest sensitivity")We then create a ranking system with points based on the position of each model regarding the two core metrics, balanced accuracy and sensitivity.
# Rank-based scoring: each model earns points equal to its ascending rank on
# each core metric (the best model gets as many points as there are models),
# so a higher metric value yields more points. The rank_* columns give the
# descending position (1 = best). The previous code hard-coded 22
# (number of models + 1); n() + 1 computes it from the data, so the scoring
# stays correct if models are added or removed.
summary_points <- summary_down_wide %>%
  mutate(
    rank_bacc = n() + 1 - rank(balanced_accuracy),
    points_bacc = rank(balanced_accuracy),
    rank_sens = n() + 1 - rank(sensitivity),
    points_sens = rank(sensitivity),
    total_points = points_bacc + points_sens
  )
# Show the 5 models with the highest total number of ranking points.
summary_points %>%
arrange(desc(total_points)) %>%
dplyr::slice(1:5) %>%
select(model, points_bacc, points_sens, total_points) %>%
kbl() %>%
kable_styling(position = "center")| model | points_bacc | points_sens | total_points |
|---|---|---|---|
| XGB_all | 20 | 20 | 40 |
| LogReg_NuTo | 19 | 17 | 36 |
| RF_NuTo | 17 | 19 | 36 |
| RF_all | 21 | 13 | 34 |
| RF_NuBi | 14 | 18 | 32 |
# Plot the five best-scoring models (by total ranking points) as a bar chart,
# zoomed to the 30-40 point range with unit breaks on the y axis.
top5_points <- summary_points %>%
  arrange(desc(total_points)) %>%
  dplyr::slice(1:5)
ggplot(top5_points, aes(x = model, y = total_points, fill = model)) +
  geom_col(position = "dodge") +
  coord_cartesian(ylim = c(30, 40)) +
  scale_y_continuous(breaks = seq(30, 40, by = 1)) +
  labs(y = "Total number of points", x = "Model",
       title = "Top 5 models with highest number of points")
Here we can see that XGB_all has the highest number of points based on
the ranking system combining the balanced accuracy and sensitivity
metrics, with a total of 40 points, followed by RF_NuTo and LogReg_NuTo,
which share second place with 36 points.
6.6.3 Showcasing the best model - XGBoost_all
Here we show 4 metrics for the best model we selected. We can see that the balanced accuracy is at about 56%. The sensitivity is around 59.5%, among the highest of all the models. Kappa is at 11.85%, which is quite low but still among the top values of the models we compared.
# Bar chart of the summary metrics for the selected best model (XGB_all),
# values in %, y axis zoomed to the 10-60 range, viridis colour scale.
summary_down_long %>%
filter(model =="XGB_all") %>%
ggplot(aes(x=model, y=value, fill = metric))+
geom_bar(stat="identity", position = "dodge")+
coord_cartesian(ylim = c(10, 60)) +
labs(y = "Value (in %)", x = "", title = "Showcasing 4 metrics for the XGB_all model")+
scale_fill_viridis(discrete = TRUE)6.6.3.1 Prediction robustness and plotting ROC curve of XGBoost_all
We can clearly see that even the best model has a lot of trouble clearly classifying the results by attributing probabilities to each class. There is some severe overlap in probabilities around the threshold at 0.5. The median probability for each class is barely above and below the threshold for “good” and “bad” respectively. This indicates very low robustness of the model and confirms the poor classification performance.
# Predict probabilities for the best model (XGB_all) on the test set and attach
# them to the test data for plotting.
roc_df <- test_all %>%
  mutate(prob_test = predict(xgb_down_fit, newdata = xgb_te))
# Histogram: distribution of predicted probabilities, coloured by true class.
roc_df %>%
  ggplot(aes(x = prob_test, fill = rating_bin)) +
  geom_histogram(bins = 20) +
  labs(x = "Probability", y = "Count",
       title = "Distribution of predicted probabilities on the test set - XGB_all")
# Boxplot: per-class spread of predicted probabilities. Title fixed from
# "on the test" to "on the test set" for consistency with the histogram above.
roc_df %>%
  ggplot(aes(x = rating_bin, y = prob_test)) +
  geom_boxplot() +
  labs(x = "Response", y = "Probability",
       title = "Boxplot of predicted probabilities on the test set - XGB_all")
Here we can see the ROC curve relating to the test set probabilities of
the XGB_all model. This shows us that it is better than the random
model, but not great performance either, as the AUC is clearly quite
small.
# Empirical ROC curve of the XGB_all test-set probabilities (ROCit package),
# plotted with a legend and the Youden-index optimal point marked.
rocit_emp <- rocit(score = xgb_down_prob, class = numeric_te$rating_bin, method = "emp")
plot(rocit_emp, col = c(1,"gray50"), legend = TRUE, YIndex = TRUE)6.7 Variable Importance
Below we show all the variables from most to least important from the top model we selected, XGB_all.
# XGBoost variable importance: per-feature Gain, Cover and Frequency extracted
# from the fitted model, ordered from most to least important (printed below).
xgb_importance_matrix <- xgb.importance(model = xgb_down_fit)
xgb_importance_matrix#> Feature Gain Cover Frequency
#> 1: sodium 0.1179688 0.1055961 0.111111
#> 2: fat 0.0997009 0.0553565 0.093750
#> 3: calories 0.0965391 0.0858199 0.100694
#> 4: protein 0.0847009 0.0624822 0.090278
#> 5: chicken 0.0359370 0.0385186 0.034722
#> 6: citrus 0.0315465 0.0347424 0.024306
#> 7: pasta 0.0262362 0.0293635 0.024306
#> 8: cheddar 0.0234738 0.0277968 0.020833
#> 9: goat_cheese 0.0222012 0.0206934 0.020833
#> 10: zucchini 0.0206511 0.0229770 0.017361
#> 11: cornmeal 0.0200219 0.0214049 0.017361
#> 12: turkey 0.0168494 0.0163138 0.013889
#> 13: bell_pepper 0.0166124 0.0197995 0.017361
#> 14: cranberry 0.0151880 0.0196194 0.013889
#> 15: chickpea 0.0146960 0.0241223 0.017361
#> 16: orange 0.0143997 0.0097460 0.013889
#> 17: shallot 0.0142563 0.0147837 0.010417
#> 18: salad 0.0138324 0.0019759 0.013889
#> 19: coriander 0.0122258 0.0195144 0.013889
#> 20: radish 0.0103198 0.0139197 0.013889
#> 21: avocado 0.0098820 0.0146946 0.010417
#> 22: tomato 0.0098339 0.0007116 0.010417
#> 23: artichoke 0.0098065 0.0146618 0.010417
#> 24: cabbage 0.0096424 0.0106856 0.010417
#> 25: poultry 0.0093738 0.0098678 0.010417
#> 26: butter 0.0090461 0.0098878 0.006944
#> 27: parsley 0.0085243 0.0063144 0.010417
#> 28: raspberry 0.0083538 0.0135184 0.010417
#> 29: corn 0.0078651 0.0142280 0.010417
#> 30: feta 0.0076253 0.0099729 0.006944
#> 31: beef_tenderloin 0.0075823 0.0096208 0.006944
#> 32: cookie 0.0075234 0.0086715 0.006944
#> 33: pear 0.0074758 0.0099982 0.006944
#> 34: mussel 0.0073711 0.0064123 0.006944
#> 35: walnut 0.0069874 0.0072764 0.006944
#> 36: rice 0.0069834 0.0071474 0.006944
#> 37: strawberry 0.0069819 0.0069726 0.006944
#> 38: prosciutto 0.0067572 0.0098663 0.006944
#> 39: broccoli 0.0067342 0.0099239 0.006944
#> 40: ham 0.0063706 0.0062454 0.006944
#> 41: fig 0.0061310 0.0094959 0.006944
#> 42: bean 0.0060533 0.0091965 0.006944
#> 43: chestnut 0.0057221 0.0099680 0.006944
#> 44: banana 0.0057032 0.0095698 0.006944
#> 45: cilantro 0.0053733 0.0096860 0.006944
#> 46: blueberry 0.0040459 0.0048886 0.003472
#> 47: green_onion_scallion 0.0037643 0.0049068 0.003472
#> 48: mushroom 0.0037224 0.0036043 0.003472
#> 49: raisin 0.0036582 0.0033373 0.003472
#> 50: pea 0.0036520 0.0037169 0.003472
#> 51: anise 0.0036353 0.0036086 0.003472
#> 52: cherry 0.0036059 0.0049097 0.003472
#> 53: ginger 0.0035655 0.0001217 0.003472
#> 54: potato 0.0035259 0.0049294 0.003472
#> 55: pork_chop 0.0035203 0.0049081 0.003472
#> 56: hot_pepper 0.0034888 0.0047202 0.003472
#> 57: prune 0.0034452 0.0050029 0.003472
#> 58: cinnamon 0.0034398 0.0003352 0.003472
#> 59: cream_cheese 0.0033880 0.0023902 0.003472
#> 60: thyme 0.0033171 0.0006069 0.003472
#> 61: mango 0.0032992 0.0047720 0.003472
#> 62: rosemary 0.0032800 0.0048986 0.003472
#> 63: berry 0.0032799 0.0048929 0.003472
#> 64: basil 0.0032637 0.0008310 0.003472
#> 65: scallop 0.0031938 0.0039286 0.003472
#> 66: pistachio 0.0031821 0.0005749 0.003472
#> 67: arugula 0.0031112 0.0010915 0.003472
#> 68: beet 0.0030993 0.0048253 0.003472
#> 69: blue_cheese 0.0030886 0.0049777 0.003472
#> 70: jicama 0.0029381 0.0048975 0.003472
#> 71: spinach 0.0029021 0.0048763 0.003472
#> 72: curry 0.0026563 0.0049482 0.003472
#> 73: sour_cream 0.0025272 0.0049160 0.003472
#> 74: pastry 0.0024561 0.0048843 0.003472
#> 75: pecan 0.0024359 0.0034765 0.003472
#> 76: ground_beef 0.0019944 0.0046999 0.003472
#> 77: kale 0.0016813 0.0049237 0.003472
#> 78: pork 0.0007755 0.0004588 0.003472
#> Feature Gain Cover Frequency
# Visualise the XGBoost feature importances as a ggplot bar chart,
# replacing the default title with a custom one.
importance_plot <- xgb.ggplot.importance(importance_matrix = xgb_importance_matrix)
importance_plot + labs(title = "XGBoost Feature Importance")